namespace ucstypes

------------------------------------------
-- Unicode Character Types
-- The following is taken from AndroidUnicode.h, AndroidUnicode.cpp and
-- characterData.h from the Android project
--
-- *
-- * Copyright (C) 2008 The Android Open Source Project
-- *
-- * Licensed under the Apache License, Version 2.0 (the "License");
-- * you may not use this file except in compliance with the License.
-- * You may obtain a copy of the License at
-- *
-- * http://www.apache.org/licenses/LICENSE-2.0
-- *
-- * Unless required by applicable law or agreed to in writing, software
-- * distributed under the License is distributed on an "AS IS" BASIS,
-- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- * See the License for the specific language governing permissions and
-- * limitations under the License.
-- *

include std/types.e
include std/math.e
include std/utils.e

-- For Latin1 characters just index into this array to get the index and decomposition.
-- findCharacterValue() reads entry [c + 1] for every code point c <= 0xFF.
-- Each 16-bit entry carries a decomposition type in its top 5 bits
-- (see DECOMPOSITION_SHIFT / DECOMPOSITION_MASK) and an index into PACKED_DATA
-- in its low 11 bits (getPackedData() masks with 0x7FF).
constant LATIN1_DATA = u" 0001 0001 0001 0001 0001 0001 0001 0001 0001 0002 0003 0002 0004 0003 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0003 0003 0003 0002 0005 0006 0006 0007 0008 0007 0006 0006 0009 000A 0006 000B 000C 000D 000C 000C 000E 000F 0010 0011 0012 0013 0014 0015 0016 0017 000C 0006 0018 0019 001A 0006 0006 001B 001C 001D 001E 001F 0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002A 002B 002C 002D 002E 002F 0030 0031 0032 0033 0034 0035 0006 0036 0037 0038 0037 0039 003A 003B 003C 003D 003E 003F 0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004A 004B 004C 004D 004E 004F 0050 0051 0052 0035 0019 0036 0019 0001 0001 0001 0001 0001 0001 0003 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 0001 5853 0006 0008 0008 0008 0008 0054 0054 1037 0054 7855 0056 0019 0057 0054 1037 0058 0059 785A 785B 1037 105C 0054 0006 1037 785D 7855 005E 305F 305F 305F 0006 0860 0860 0860 
0860 0860 0860 0060 0860 0860 0860 0860 0860 0860 0860 0860 0860 0060 0860 0860 0860 0860 0860 0860 0019 0060 0860 0860 0860 0860 0860 0060 0055 0861 0861 0861 0861 0861 0861 0061 0861 0861 0861 0861 0861 0861 0861 0861 0861 0061 0861 0861 0861 0861 0861 0861 0019 0061 0861 0861 0861 0861 0861 0061 0862 "

-- Each of these arrays is stripped into ranges. In order to build the arrays, each
-- codepoint was bit-shifted so that even and odd characters were separated into different
-- arrays. The identifier of each array is the top byte after bit-shifting.
-- The numbers stored in the array are the bit-shifted codepoint, the decomposition, and an
-- index into another array of all possible packed data values. The top 16 bits are the
-- codepoint and the bottom 16 are the decomposition and index. The top 5 bits for the decomposition
-- and the rest for the index.

-- NOTE(review): the next statement reads "constant aconstant a1 = ...". It looks
-- like the definition of a0 (name, '=' and its data) was lost and fused with the
-- start of the a1 definition. FULL_DATA refers to a0, so this needs to be restored
-- from the upstream file -- TODO confirm.
constant aconstant a1 = U"00000071 536C0078 7C000871 7D0F0078"
constant a7 = U"00100057 00400078 00800083 00F80078 8000013F FFFF0078"
constant a8 = U"0000013F 7FFF0078"
-- NOTE(review): same damage as above; the definition of a16 (referenced by
-- FULL_DATA) appears to be missing here -- TODO confirm.
constant aconstant a17 = U"00000071 536B0078 7C000871 7D0F0078"
constant a23 = U"00000057 00010078 00100057 00400078 00800083 00F80078 8000013F FFFF0078"
constant a24 = U"0000013F 7FFF0078"

-- The full set of all arrays to be searched.
-- One slot per value of the top byte of the bit-rotated code point (see
-- findCharacterValue). A slot holding the atom 0 means "no data for this block".
constant FULL_DATA = {
    a0, a1, 0, 0, 0, 0, 0, a7, a8, 0, 0, 0, 0, 0, 0, 0,
    a16, a17, 0, 0, 0, 0, 0, a23, a24, 0, 0, 0, 0, 0, 0, 0
}

-- Offsets added to a code point to obtain its upper-case form; selected by the
-- TOUPPER field of the packed data.
constant UCDIFF = {
    0, -32, 743, 121, -1, -232, -300, 97, 163, 130, 56, -2, -79, -210, -206, -205,
    -202, -203, -207, -209, -211, -213, -214, -218, -217, -219, -83, 84, -38, -37,
    -31, -64, -63, -62, -57, -47, -54, -86, -80, 7, -96, -48, -59, 8, 74, 86, 100,
    128, 112, 126, 9, -7205, -16, -26, -7264, -40
}

-- Offsets added to a code point to obtain its lower-case form; selected by the
-- TOLOWER field of the packed data.
constant LCDIFF = {
    0, 32, 1, -199, -121, 210, 206, 205, 79, 202, 203, 207, 211, 209, 213, 214,
    218, 217, 219, 2, -97, -56, -130, -163, 83, 38, 37, 64, 63, -60, -7, 80, 48,
    7264, -8, -74, -9, -86, -100, -112, -128, -126, -7517, -8383, -8262, 16, 26, 40
}

-- Title-case offsets; selected by the TOTITLE field. toTitle() treats an entry
-- equal to TOTITLE_MASK (3) as "use the upper-case mapping instead".
constant TCDIFF = { 3, 1, 0, -1 }

-- Offsets added to a mirrored code point to obtain its partner; selected by the
-- MIRROR field of the packed data.
constant MIRROR_DIFF = {
    0, 1, -1, 2, -2, 16, -16, 3, -3, 2016, 138, 1824, 2104, 2108, 2106, -138,
    8, 7, -8, -7, -1824, -2016, -2104, -2106, -2108
}

-- Numeric values selected by the NUMERIC field of the packed data.
-- getNumericValue() treats -2 as "look the code point up in NON_INTEGERS".
constant NUMERICS = {
    -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -2, 100,
    1000, 40, 50, 60, 70, 80, 90, 10000, 500, 5000, 36, 37, 38, 39, 41, 42, 43,
    44, 45, 46, 47, 48, 49, 200, 300, 400, 600, 700, 800, 900, 2000, 3000, 4000,
    6000, 7000, 8000, 9000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000
}

-- Code points whose numeric value is not an integer. Position N here pairs with
-- position N of NON_INTEGER_VALS.
constant NON_INTEGERS = u" 00BC 00BD 00BE 0F2A 0F2B 0F2C 0F2D 0F2E 0F2F 0F30 0F31 0F32 0F33 2153 2154 2155 2156 2157 2158 2159 215A 215B 215C 215D 215E 2CFD "

-- Values for the code points listed in NON_INTEGERS, in the same order.
-- U+0F2A..U+0F33 are the Tibetan "half" digits; the Unicode Character Database
-- assigns them 1/2, 3/2, ... 17/2 and -1/2 (not the whole numbers 1..9, 0).
constant NON_INTEGER_VALS = {
    1/4, 1/2, 3/4,                                      -- 00BC 00BD 00BE
    1/2, 3/2, 5/2, 7/2, 9/2, 11/2, 13/2, 15/2, 17/2,    -- 0F2A .. 0F32
    -1/2,                                               -- 0F33
    1/3, 2/3, 1/5, 2/5, 3/5, 4/5, 1/6, 5/6,             -- 2153 .. 215A
    1/8, 3/8, 5/8, 7/8,                                 -- 215B .. 215E
    1/2,                                                -- 2CFD
    $}

-- Copied from java.lang.Character implementation:
-- Layout of one 32-bit PACKED_DATA entry (bit 31 on the left), matching the
-- *_SHIFT / *_MASK values declared further down. The numeric-value field and
-- the mirror-diff field share the same starting bit (25).
/*
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 F E D C B A 9 8 7 6 5 4 3 2 1 0 F E D C B A 9 8 7 6 5 4 3 2 1 0
                                                       ---------   31 types
                                             ---------             18 directionalities
                                           -                       2 mirrored
                               -----------                         56 toupper diffs
                   -----------                                     48 tolower diffs
               ---                                                 4 totitlecase diffs
 -------------                                                     84 numeric values
     ---------                                                     24 mirror char diffs
*/
constant PACKED_DATA = U" 
00000000 0000012F 0000016F 0000014F 0000018F 0000018C 000001B8 000000B8 000000BA 020005B5 040005B6 00000099 000000F8 00000094 02000069 04000069 06000069 08000069 0A000069 0C000069 0E000069 10000069 12000069 14000069 060005B9 000001B9 080005B9 16020001 18020001 1A020001 1C020001 1E020001 20020001 22020001 24020001 26020001 28020001 2A020001 2C020001 2E020001 30020001 32020001 34020001 36020001 38020001 3A020001 3C020001 3E020001 40020001 42020001 44020001 46020001 48020001 060005B5 080005B6 000001BB 000001B7 16000802 18000802 1A000802 1C000802 1E000802 20000802 22000802 24000802 26000802 28000802 2A000802 2C000802 2E000802 30000802 32000802 34000802 36000802 38000802 3A000802 3C000802 3E000802 40000802 42000802 44000802 46000802 48000802 000000EC 000001BC 00000002 0A0005BD 00000130 000000BC 000000B9 0600006B 0800006B 00001002 0400006B 0C0005BE 4A0001AB 00020001 00000802 00001802 00040001 00060001 00002002 00080001 000C0001 000E0001 00100001 00140001 00160001 00180001 00004002 00004802 00200001 00220001 00000005 00A60001 01805802 01042003 00280001 002C0001 00000001 00000000 00007002 00007802 00009802 0000A802 0000B802 0000C002 0000C802 0000D002 00000004 000001A4 00000106 00320001 00340001 00360001 00380001 0000E002 0000E802 0000F002 0000F802 00010002 00010802 00012002 00012802 00013802 003A0001 003E0001 00013002 0000001C 00000107 00400001 00000018 00014802 000001B4 00000038 00000025 00000050 00000058 00000045 00000044 020000C9 060000C9 0A0000C9 0E0000C9 120000C9 000000D8 0000005C 00000008 02000009 06000009 0A000009 0E000009 12000009 0400000B 0800000B 0000000B 1600000B 4E00000B 00000006 4A00000B 000001B5 00420001 0600000B 0A00000B 0E00000B 1200000B 3E00000B 5200000B 5600000B 5A00000B 5C00000B 000001B6 2400000A 2800000A 00000010 020001AB 060001AB 0A0001AB 0E0001AB 120001AB 00000108 00015802 00440001 00016002 00016802 00017002 00017802 00018002 00018802 00440003 00460001 00480003 00019802 004A0001 004C0001 004E0001 003C0001 00500001 00520001 000001BD 0000018D 000001D0 
00000250 00000230 040005BE 000000F9 0200006B 0A00006B 0E00006B 1200006B 00540001 00560001 000005B9 045A000A 085A000A 0C5A000A 105A000A 145A000A 185A000A 525A000A 5E5A000A 0401A00A 0801A00A 0C01A00A 1001A00A 1401A00A 1801A00A 5201A00A 5E01A00A 4E00000A 5C00000A 0E0005B9 100005B9 020005B9 040005B9 160005B9 180005B9 1A0005B9 200005B9 220005B9 240005B9 260005B9 040001AB 080001AB 0C0001AB 100001AB 140001AB 180001AB 1C0001AB 200001AB 240001AB 280001AB 0C00006B 1000006B 1400006B 1800006B 1C00006B 2000006B 2400006B 2800006B 005C001C 0001A81C 1A0001AB 1E0001AB 220001AB 260001AB 2A0001AB 160001AB 020005B6 100005B6 280005B9 2C0005B9 300005B9 0001B002 020005BD 0600000A 0A00000A 0E00000A 1200000A 1600000A 3E00000A 0C00000B 1000000B 1400000B 2E0001AB 320001AB 360001AB 3A0001AB 3E0001AB 420001AB 460001AB 640001AB 680001AB 6A0001AB 6E0001AB 720001AB 760001AB 7A0001AB 00000013 00000012 0000005A 000001B0 7C00000B 8000000B 8200000B 8600000B 8C00000B 6000000B 9200000B 9600000B 9800000B 9C00000B A000000B A400000B 4A0001AA 040001AA 520001AA 600001AA 0C0001AA 5E0001AA 160001AA 4C0001AA 4E0001AA 9E0001AA 060001AA 8800000A 2A0001AA 005E0001 0001B802 0400002B 0800002B 1600002B 4C00002B 00002802 00003002 000A0001 00120001 00003802 001A0001 001C0001 001E0001 00240001 00005002 00006002 002A0001 002E0001 00300001 00006802 00008002 00008802 00009002 0000A002 0000B002 0000D906 00011002 00011802 00014002 040000C9 080000C9 0C0000C9 100000C9 140000C9 04000009 08000009 0C000009 10000009 14000009 2200000B 4C00000B 2A00000B 5000000B 5400000B 5800000B 2600000A 00015002 00019002 00000030 000001BE 0000014E 00000210 000001F0 00580001 065A000A 0A5A000A 0E5A000A 125A000A 165A000A 1A5A000A 4C5A000A 4E5A000A 0601A00A 0A01A00A 0E01A00A 1201A00A 1601A00A 1A01A00A 4C01A00A 4E01A00A 6000000A 0000000A 120005B9 140005B9 1C0005B9 1E0005B9 1600006B 1A00006B 1E00006B 2200006B 2600006B 2A00006B 0E0005B5 040005B5 2A0005B9 2E0005B9 0200000A 0400000A 0800000A 0C00000A 1000000A 1400000A 2A00000A 2C0001AB 300001AB 340001AB 
380001AB 3C0001AB 400001AB 440001AB 480001AB 620001AB 660001AB 500001AB 6C0001AB 700001AB 740001AB 780001AB 520001AB 7E00000B 5E00000B 8400000B 8800000B 8A00000B 8E00000B 9000000B 9400000B 9A00000B 9E00000B A200000B A600000B 5C0001AA 3E0001AA 7E0001AA 0600002B 0A00002B 2A00002B 4E00002B 00000019 "

-- Character types as specified in the Unicode standard.
-- These map directly to java.lang.Character.
enum
    UNASSIGNED = 0,
    UPPERCASE_LETTER,
    LOWERCASE_LETTER,
    TITLECASE_LETTER,
    MODIFIER_LETTER,
    OTHER_LETTER,
    NONSPACING_MARK,
    ENCLOSING_MARK,
    COMBINING_SPACING_MARK,
    DECIMAL_DIGIT_NUMBER,
    LETTER_NUMBER,
    OTHER_NUMBER,
    SPACE_SEPARATOR,
    LINE_SEPARATOR,
    PARAGRAPH_SEPARATOR,
    CONTROL,
    FORMAT,
    NOT_ASSIGNED,
    PRIVATE_USE,
    SURROGATE,
    DASH_PUNCTUATION,
    OPEN_PUNCTUATION,
    CLOSE_PUNCTUATION,
    CONNECTOR_PUNCTUATION,
    OTHER_PUNCTUATION,
    MATH_SYMBOL,
    CURRENCY_SYMBOL,
    MODIFIER_SYMBOL,
    OTHER_SYMBOL,
    INITIAL_QUOTE_PUNCTUATION,
    FINAL_QUOTE_PUNCTUATION

-- Decomposition types as described by the unicode standard.
-- These values map to the same values in dchar.h in ICU.
enum
    NONE = 0,
    CANONICAL,
    COMPAT,
    CIRCLE,
    FINAL,
    FONT,
    FRACTION,
    INITIAL,
    ISOLATED,
    MEDIAL,
    NARROW,
    NOBREAK,
    SMALL,
    SQUARE,
    SUB,
    SUPER,
    VERTICAL,
    WIDE

-- Shift distances and masks used to pull each field out of a PACKED_DATA entry
-- (and, for DECOMPOSITION_*, out of the 16-bit value from findCharacterValue).
enum
    TYPE_SHIFT = 0,
    TYPE_MASK = 0b1_1111,
    DIRECTION_SHIFT = 5, --TYPE_SHIFT + 5
    DIRECTION_MASK = 0b1_1111,
    MIRRORED_SHIFT = 10, -- DIRECTION_SHIFT + 5
    MIRRORED_MASK = 0b1,
    TOUPPER_SHIFT = 11, -- MIRRORED_SHIFT + 1
    TOUPPER_MASK = 0b11_1111,
    TOLOWER_SHIFT = 17, -- TOUPPER_SHIFT + 6
    TOLOWER_MASK = 0b11_1111,
    TOTITLE_SHIFT = 23, --TOLOWER_SHIFT + 6
    TOTITLE_MASK = 0b11,
    MIRROR_SHIFT = 25, -- TOTITLE_SHIFT + 2
    MIRROR_MASK = 0b1_1111,
    NUMERIC_SHIFT = 25, -- TOTITLE_SHIFT + 2
    NUMERIC_MASK = 0b111_1111,
    DECOMPOSITION_SHIFT = 11,
    DECOMPOSITION_MASK = 0b1_1111

-- Directions specified in the Unicode standard. These directions map directly
-- to java.lang.Character.
enum
    UNDEFINED = -1,
    LEFT_TO_RIGHT,
    RIGHT_TO_LEFT,
    RIGHT_TO_LEFT_ARABIC,
    EUROPEAN_NUMBER,
    EUROPEAN_NUMBER_SEPARATOR,
    EUROPEAN_NUMBER_TERMINATOR,
    ARABIC_NUMBER,
    COMMON_NUMBER_SEPARATOR,
    DIR_NONSPACING_MARK,
    BOUNDARY_NEUTRAL,
    BLOCK_SEPARATOR,
    SEGMENT_SEPARATOR,
    WHITESPACE,
    OTHER_NEUTRALS,
    LEFT_TO_RIGHT_EMBEDDING,
    LEFT_TO_RIGHT_OVERRIDE,
    RIGHT_TO_LEFT_EMBEDDING,
    RIGHT_TO_LEFT_OVERRIDE,
    POP_DIRECTIONAL_FORMAT,
    $

-- Looks up the 16-bit table value for a code point.
--
-- Parameters:
-- # ##c##: An atom. The code point to look up.
--
-- Returns:
-- -1 when ##c## is outside 0 .. 0x10FFFF or no range entry covers it,
-- 0 when the block containing ##c## has no table, otherwise a 16-bit value
-- (decomposition type in the top 5 bits, PACKED_DATA index in the low 11).
function findCharacterValue(atom c)
    if c > 0x10FFFF then
        return -1
    end if
    if c < 0 then
        return -1
    end if
    if c <= 0xFF then
        -- Latin-1 is a direct lookup.
        return LATIN1_DATA[c + 1]
    end if

    -- Rotate the bits because the tables are separated into even and odd
    -- codepoints
    atom u = or_bits(shift_bits(c, 1), shift_bits(and_bits(c, 1), -20))

    atom idx
    idx = shift_bits(u, 16) + 1

    object array = FULL_DATA[idx]
    if atom(array) then
        return 0
    end if

    -- Build the search key: the low 16 bits of the rotated code point moved
    -- to the high half, with the low half filled with ones, so the compare
    -- in the loop does not need to shift each array entry down by 16.
    -- This is done with plain arithmetic instead of shift_bits()/or_bits():
    -- the bitwise routines yield signed results, so a key with bit 31 set
    -- would come out negative and compare below every (unsigned) table
    -- entry, making the lookup fail.
    u = and_bits(u, 0xFFFF) * 65536 + 0xFFFF

    integer high = length(array)
    integer low = 1
    while (low < high - 1) do
        integer probe = floor((high + low + 1) / 2)

        -- The entries contain the codepoint in the high 16 bits and the index
        -- into PACKED_DATA in the low 16.
        if array[probe] > u then
            high = probe
        else
            low = probe
        end if
    end while

    idx = array[low]
    if idx > u then
        -- No range starts at or below this code point.
        return -1
    end if

    return and_bits(array[low], 0xFFFF)
end function

-- Fetches the packed property word for a code point.
--
-- Parameters:
-- # ##c##: An atom. The code point to look up.
--
-- Returns:
-- The PACKED_DATA entry for ##c##, or 0 (all fields zero) when
-- findCharacterValue reports that ##c## is invalid or not found.
function getPackedData(atom c)
    -- findCharacterValue returns a 16-bit value with the top 5 bits containing
    -- a decomposition type and the remaining bits containing an index.
    atom cv = findCharacterValue(c)
    if cv < 0 then
        -- Masking the -1 sentinel would give index 0x7FF, which is far past
        -- the end of PACKED_DATA and would abort with a subscript error.
        return 0
    end if
    return PACKED_DATA[1 + and_bits(cv, 0x7FF)]
end function

-- Returns the character type (one of the UNASSIGNED .. FINAL_QUOTE_PUNCTUATION
-- enum values) stored in the TYPE field of the packed data for ##ucs_char##.
-- Code points at or above 0x10FFFF are reported as UNASSIGNED without a lookup.
function getType(atom ucs_char)
    if ucs_char >= 0x10FFFF then
        return UNASSIGNED
    end if
    return and_bits(shift_bits(getPackedData(ucs_char), TYPE_SHIFT), TYPE_MASK)
end function

-- Returns the directionality stored in the DIRECTION field of the packed data
-- for ##c##. UNDEFINED is returned when the packed word is zero or when the
-- field holds the all-ones value (DIRECTION_MASK).
function getDirectionality(atom c)
    atom data = getPackedData(c)

    if not data then
        return UNDEFINED
    end if

    integer d = and_bits(shift_bits(data, DIRECTION_SHIFT), DIRECTION_MASK)
    if d = DIRECTION_MASK then
        return UNDEFINED
    end if
    return d
end function

--**
-- The code point is a 'letter'.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isAlpha(atom ucs_char)
    if ucs_char <= 0x7F then
        -- ASCII (and anything below it) is delegated to std/types.e.
        return t_alpha(ucs_char)
    end if

    switch getType(ucs_char) do
        case UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER, OTHER_LETTER then
            return TRUE
        case else
            return FALSE
    end switch
end function

--**
-- The code point is an uppercase 'letter'.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isUpper(atom ucs_char)
    if ucs_char <= 0x7F then
        return t_upper(ucs_char)
    end if
    return (getType(ucs_char) = UPPERCASE_LETTER)
end function

--**
-- The code point is a lowercase 'letter'.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isLower(atom ucs_char)
    if ucs_char <= 0x7F then
        return t_lower(ucs_char)
    end if
    return (getType(ucs_char) = LOWERCASE_LETTER)
end function

--**
-- The code point is a Title case 'letter'.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isTitle(atom ucs_char)
    -- No code point in the ASCII range has the TITLECASE_LETTER type (ASCII
    -- capitals are UPPERCASE_LETTER), so the fast path answers FALSE. This
    -- keeps ASCII consistent with the table-driven answer used for every
    -- other code point, e.g. U+00C0, which is also upper case, not title case.
    if ucs_char <= 0x7F then
        return FALSE
    end if
    return (getType(ucs_char) = TITLECASE_LETTER)
end function

--**
-- The code point is either a 'letter' or a 'digit'.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isAlphaNum(atom ucs_char)
    if ucs_char <= 0x7F then
        -- ASCII is delegated to std/types.e.
        return t_alnum(ucs_char)
    end if

    switch getType(ucs_char) do
        case UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER, OTHER_LETTER, DECIMAL_DIGIT_NUMBER then
            return TRUE
        case else
            return FALSE
    end switch
end function

--**
-- The code point is a 'digit'.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isDigit(atom ucs_char)
    if ucs_char <= 0x7F then
        return t_digit(ucs_char)
    end if

    switch getType(ucs_char) do
        case DECIMAL_DIGIT_NUMBER then
            return TRUE
        case else
            return FALSE
    end switch
end function

--**
-- The code point is a 'number'. Some code points represent fractional numbers.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isNumber(atom ucs_char)
    if ucs_char <= 0x7F then
        return t_digit(ucs_char)
    end if

    switch getType(ucs_char) do
        case DECIMAL_DIGIT_NUMBER, LETTER_NUMBER, OTHER_NUMBER then
            return TRUE
        case else
            return FALSE
    end switch
end function

--**
-- The code point is a word separator.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isSeparator(atom ucs_char)
    -- ASCII and below: defer to the std/types.e whitespace test.
    if ucs_char <= 0x7F then
        return t_space(ucs_char)
    end if
    return find(getType(ucs_char), {SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR}) != 0
end function

--**
-- The code point is a 'space' character.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isSpace(atom ucs_char)
    if ucs_char <= 0x7F then
        return t_space(ucs_char)
    end if
    atom kind = getType(ucs_char)
    return kind = SPACE_SEPARATOR
end function

--**
-- The code point is a line separator.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isLine(atom ucs_char)
    atom kind = getType(ucs_char)
    return kind = LINE_SEPARATOR
end function

--**
-- The code point is a paragraph separator.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isParagraph(atom ucs_char)
    atom kind = getType(ucs_char)
    return kind = PARAGRAPH_SEPARATOR
end function

--**
-- The code point is a textual 'marker' (non-spacing, combining-spacing or
-- enclosing mark).
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isMark(atom ucs_char)
    return find(getType(ucs_char), {NONSPACING_MARK, COMBINING_SPACING_MARK, ENCLOSING_MARK}) != 0
end function

--**
-- The code point is a non-spacing letter.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isNonSpacing(atom ucs_char)
    atom kind = getType(ucs_char)
    return kind = NONSPACING_MARK
end function

--**
-- The code point is a punctuation mark.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isPunctuation(atom ucs_char)
    -- ASCII and below: defer to the std/types.e punctuation test.
    if ucs_char <= 0x7F then
        return t_punct(ucs_char)
    end if
    sequence punct_types = {
        CONNECTOR_PUNCTUATION, DASH_PUNCTUATION, OPEN_PUNCTUATION,
        CLOSE_PUNCTUATION, INITIAL_QUOTE_PUNCTUATION,
        FINAL_QUOTE_PUNCTUATION, OTHER_PUNCTUATION
    }
    return find(getType(ucs_char), punct_types) != 0
end function

--**
-- The code point is a 'symbol' (math, currency, modifier or other symbol).
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isSymbol(atom ucs_char)
    return find(getType(ucs_char), {MATH_SYMBOL, CURRENCY_SYMBOL, MODIFIER_SYMBOL, OTHER_SYMBOL}) != 0
end function

--**
-- The code point is a currency 'symbol'.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isCurrency(atom ucs_char)
    atom kind = getType(ucs_char)
    return kind = CURRENCY_SYMBOL
end function

--**
-- The code point is a functional item, such as a control character, or private-use.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isOther(atom ucs_char)
    return find(getType(ucs_char), {CONTROL, FORMAT, SURROGATE, PRIVATE_USE, NOT_ASSIGNED}) != 0
end function

--**
-- The code point is a control item.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isControl(atom ucs_char)
    -- ASCII and below: defer to the std/types.e control-character test.
    if ucs_char <= 0x7F then
        return t_cntrl(ucs_char)
    end if
    atom kind = getType(ucs_char)
    return kind = CONTROL
end function

--**
-- The code point is a formatting item.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isFormat(atom ucs_char)
    atom kind = getType(ucs_char)
    return kind = FORMAT
end function

--**
-- The code point is a surrogate code.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isSurrogate(atom ucs_char)
    atom kind = getType(ucs_char)
    return kind = SURROGATE
end function

--**
-- The code point is a private-use item.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isPrivate(atom ucs_char)
    atom kind = getType(ucs_char)
    return kind = PRIVATE_USE
end function

--**
-- The code point is a character with a glyph.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isGraph(atom ucs_char)
    if ucs_char <= 0x7F then
        return t_graph(ucs_char)
    end if
    -- Letters, numbers, marks, punctuation and symbols; separators are excluded.
    sequence graph_types = {
        UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER, OTHER_LETTER,
        DECIMAL_DIGIT_NUMBER, LETTER_NUMBER, OTHER_NUMBER,
        NONSPACING_MARK, COMBINING_SPACING_MARK, ENCLOSING_MARK,
        CONNECTOR_PUNCTUATION, DASH_PUNCTUATION, OPEN_PUNCTUATION, CLOSE_PUNCTUATION,
        INITIAL_QUOTE_PUNCTUATION, FINAL_QUOTE_PUNCTUATION, OTHER_PUNCTUATION,
        MATH_SYMBOL, CURRENCY_SYMBOL, MODIFIER_SYMBOL, OTHER_SYMBOL
    }
    return find(getType(ucs_char), graph_types) != 0
end function

--**
-- The code point is a character that is printable.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isPrint(atom ucs_char)
    -- ASCII and below: defer to the std/types.e printable test.
    if ucs_char <= 0x7F then
        return t_print(ucs_char)
    end if
    -- Same set as isGraph plus SPACE_SEPARATOR.
    sequence print_types = {
        UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER, OTHER_LETTER,
        DECIMAL_DIGIT_NUMBER, LETTER_NUMBER, OTHER_NUMBER,
        SPACE_SEPARATOR,
        NONSPACING_MARK, COMBINING_SPACING_MARK, ENCLOSING_MARK,
        CONNECTOR_PUNCTUATION, DASH_PUNCTUATION, OPEN_PUNCTUATION, CLOSE_PUNCTUATION,
        INITIAL_QUOTE_PUNCTUATION, FINAL_QUOTE_PUNCTUATION, OTHER_PUNCTUATION,
        MATH_SYMBOL, CURRENCY_SYMBOL, MODIFIER_SYMBOL, OTHER_SYMBOL
    }
    return find(getType(ucs_char), print_types) != 0
end function

--**
-- The code point is a directional white space character.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isDirWhiteSpace(atom ucs_char)
    atom dir = getDirectionality(ucs_char)
    return dir = WHITESPACE
end function

--**
-- The code point is a left-to-right character.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isDirLTR(atom ucs_char)
    atom dir = getDirectionality(ucs_char)
    return dir = LEFT_TO_RIGHT
end function

--**
-- The code point is a right-to-left character.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isDirRTL(atom ucs_char)
    atom dir = getDirectionality(ucs_char)
    return dir = RIGHT_TO_LEFT
end function

--**
-- The code point has an exact directional aspect. It must always be LtR or RtL.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isDirStrong(atom ucs_char)
    -- The Unicode Bidirectional Algorithm defines three strong types:
    -- left-to-right, right-to-left and right-to-left Arabic. Arabic letters
    -- carry their own directionality value, so it must be listed explicitly.
    switch getDirectionality(ucs_char) do
        case RIGHT_TO_LEFT, RIGHT_TO_LEFT_ARABIC, LEFT_TO_RIGHT then
            return TRUE
        case else
            return FALSE
    end switch
end function

--**
-- The code point has an implied directional aspect, that depends on context.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isDirWeak(atom ucs_char)
    -- The weak types of the Unicode Bidirectional Algorithm are the number
    -- related types plus non-spacing marks and boundary neutrals.
    switch getDirectionality(ucs_char) do
        case EUROPEAN_NUMBER, EUROPEAN_NUMBER_SEPARATOR, EUROPEAN_NUMBER_TERMINATOR,
             ARABIC_NUMBER, COMMON_NUMBER_SEPARATOR,
             DIR_NONSPACING_MARK, BOUNDARY_NEUTRAL then
            return TRUE
        case else
            return FALSE
    end switch
end function

--**
-- The code point has no directional aspect.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isDirNeutral(atom ucs_char)
    switch getDirectionality(ucs_char) do
        case BLOCK_SEPARATOR, SEGMENT_SEPARATOR, WHITESPACE, OTHER_NEUTRALS then
            return TRUE
        case else
            return FALSE
    end switch
end function

--**
-- The code point is a directional separator (block or segment separator).
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isDirSeparator(atom ucs_char)
    switch getDirectionality(ucs_char) do
        case BLOCK_SEPARATOR, SEGMENT_SEPARATOR then
            return TRUE
        case else
            return FALSE
    end switch
end function

--**
-- The code point is a block separator.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isBlock(atom ucs_char)
    return getDirectionality(ucs_char) = BLOCK_SEPARATOR
end function

--**
-- The code point is a segment separator.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isSegment(atom ucs_char)
    atom dir = getDirectionality(ucs_char)
    return dir = SEGMENT_SEPARATOR
end function

--**
-- The code point is a non-breaking character.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isNonBreaking(atom ucs_char)
    -- The decomposition type sits in the top bits of the raw table value,
    -- not in the packed data word.
    atom raw = findCharacterValue(ucs_char)
    atom decomp = and_bits(shift_bits(raw, DECOMPOSITION_SHIFT), DECOMPOSITION_MASK)
    return NOBREAK = decomp
end function

--**
-- The code point has a mirror (matching) character, such as parenthesis.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to test.
--
-- Returns:
-- TRUE when ##ucs_char## is in the set, FALSE otherwise.
--
public function isMirroring(atom ucs_char)
    atom packed = getPackedData(ucs_char)
    atom flag = and_bits(shift_bits(packed, MIRRORED_SHIFT), MIRRORED_MASK)
    return flag != 0
end function

--**
-- Converts the input to lower case.
--
-- Parameters:
-- # ##ucs_char##: An object. Either a single code point to convert or a text string.
--
-- Returns:
-- The converted input.
--
public function toLower(object ucs_char)
    if sequence(ucs_char) then
        -- Convert every element in turn; nested sequences recurse.
        for pos = 1 to length(ucs_char) do
            ucs_char[pos] = toLower(ucs_char[pos])
        end for
        return ucs_char
    end if

    atom packed = getPackedData(ucs_char)
    integer slot = and_bits(shift_bits(packed, TOLOWER_SHIFT), TOLOWER_MASK)
    return ucs_char + LCDIFF[slot + 1]
end function

--**
-- Converts the input to upper case.
--
-- Parameters:
-- # ##ucs_char##: An object. Either a single code point to convert or a text string.
--
-- Returns:
-- The converted input.
--
public function toUpper(object ucs_char)
    if sequence(ucs_char) then
        -- Convert every element in turn; nested sequences recurse.
        for pos = 1 to length(ucs_char) do
            ucs_char[pos] = toUpper(ucs_char[pos])
        end for
        return ucs_char
    end if

    atom packed = getPackedData(ucs_char)
    integer slot = and_bits(shift_bits(packed, TOUPPER_SHIFT), TOUPPER_MASK)
    return ucs_char + UCDIFF[slot + 1]
end function

--**
-- Converts the input to title case.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to convert.
--
-- Returns:
-- The converted input.
--
public function toTitle(atom ucs_char)
    integer delta = TCDIFF[1 + and_bits(shift_bits(getPackedData(ucs_char), TOTITLE_SHIFT), TOTITLE_MASK)]

    -- A table entry equal to TOTITLE_MASK is a marker meaning "the title-case
    -- form is the upper-case form".
    if delta = TOTITLE_MASK then
        return toUpper(ucs_char)
    end if
    return ucs_char + delta
end function

--**
-- Gets the matching character for the supplied code point.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to match.
--
-- Returns:
-- If the code point has no matching value, the code point is returned, otherwise
-- the matching (mirror) code point is returned.
--
public function toMirror(atom ucs_char)
    if isMirroring(ucs_char) then
        integer slot = and_bits(shift_bits(getPackedData(ucs_char), MIRROR_SHIFT), MIRROR_MASK)
        return ucs_char + MIRROR_DIFF[slot + 1]
    end if
    return ucs_char
end function

--**
-- Gets the numerical value of the character.
--
-- Parameters:
-- # ##ucs_char##: An atom. The code point to convert.
--
-- Returns:
-- An atom: -1 if the code point has no numerical value,
-- -2 if the value is not known, otherwise the numerical value of it.
--
public function getNumericValue(atom ucs_char)
    -- The numeric field shares its bits with the mirror field, so a mirrored
    -- code point is reported as having no numeric value.
    if isMirroring(ucs_char) then
        return -1
    end if

    atom result = NUMERICS[1 + and_bits(shift_bits(getPackedData(ucs_char), NUMERIC_SHIFT), NUMERIC_MASK)]
    if result != -2 then
        return result
    end if

    -- -2 marks a non-integer value; see whether it is one of the listed ones.
    integer pos = find(ucs_char, NON_INTEGERS)
    if pos = 0 then
        return -2
    end if
    return NON_INTEGER_VALS[pos]
end function

ifdef UCSTYPE_DEBUG then
-- DEBUG TESTING
-- This creates a file containing all the code ranges for each category.

    -- Writes the pending range (if any) for routine ##rid## to ##out##: a
    -- "first:last" pair when the range has two ends, a single value otherwise.
    procedure write_span(integer out, integer rid, sequence span)
        if span[1] != -1 then
            if span[2] != -1 then
                printf(out, "%2d %06x:%06x\n", rid & span)
            else
                printf(out, "%2d %06x\n", rid & span[1])
            end if
        end if
    end procedure

    sequence checks = {
        routine_id("isAlpha"),
        routine_id("isUpper"),
        routine_id("isLower"),
        routine_id("isTitle"),
        routine_id("isAlphaNum"),
        routine_id("isDigit"),
        routine_id("isNumber"),
        routine_id("isSeparator"),
        routine_id("isSpace"),
        routine_id("isLine"),
        routine_id("isParagraph"),
        routine_id("isMark"),
        routine_id("isNonSpacing"),
        routine_id("isPunctuation"),
        routine_id("isSymbol"),
        routine_id("isCurrency"),
        routine_id("isOther"),
        routine_id("isControl"),
        routine_id("isFormat"),
        routine_id("isSurrogate"),
        routine_id("isPrivate"),
        routine_id("isGraph"),
        routine_id("isPrint"),
        routine_id("isDirWhiteSpace"),
        routine_id("isDirLTR"),
        routine_id("isDirRTL"),
        routine_id("isDirStrong"),
        routine_id("isDirWeak"),
        routine_id("isDirNeutral"),
        routine_id("isDirSeparator"),
        routine_id("isNonBreaking"),
        routine_id("isMirroring"),
        $
    }
    sequence span
    integer state
    integer rid
    integer out = open("ucstest.txt", "w")

    for k = 1 to length(checks) do
        rid = checks[k]
        span = {-1, -1}
        state = 1   -- 1: no range open, 2: a range has been started
        for cp = 0 to 0xFFFF do
            integer hit
            hit = call_func(rid, {cp})
            if hit then
                if state = 1 then
                    span[1] = cp
                    state = 2
                elsif span[2] = -1 then
                    span[2] = cp
                elsif span[2] != cp - 1 then
                    printf(out, "%2d %06x:%06x\n", rid & span)
                    span = {cp, -1}
                else
                    span[2] = cp
                end if
            else
                -- The run (if any) ended at the previous code point.
                write_span(out, rid, span)
                span = {-1, -1}
                state = 1
            end if
        end for
        -- Emit a run that is still open at 0xFFFF.
        write_span(out, rid, span)
        puts(out, "\n\n")
    end for
    close(out)
end ifdef