SourcePro® API Reference Guide

 
List of all members | Public Types | Static Public Member Functions

Provides methods for querying the properties of Unicode characters. More...

#include <rw/i18n/RWUCharTraits.h>

Public Types

enum  BidirectionalCategory {
  NoBidirectionalCategory, BeginBidirectionalCategory, LeftToRight, RightToLeft,
  EuropeanNumber, EuropeanNumberSeparator, EuropeanNumberTerminator, ArabicNumber,
  CommonNumberSeparator, BlockSeparator, SegmentSeparator, WhiteSpaceNeutral,
  OtherNeutral, LeftToRightEmbedding, LeftToRightOverride, RightToLeftArabic,
  RightToLeftEmbedding, RightToLeftOverride, PopDirectionalFormat, DirNonSpacingMark,
  BoundaryNeutral, EndBidirectionalCategory
}
 
enum  Block {
  NoBlock, BeginBlock, BasicLatinBlock, Latin1SupplementBlock,
  LatinExtendedABlock, LatinExtendedBBlock, IpaExtensionsBlock, SpacingModifierLettersBlock,
  CombiningDiacriticalMarksBlock, GreekAndCopticBlock, CyrillicBlock, CyrillicSupplementalblock,
  ArmenianBlock, HebrewBlock, ArabicBlock, SyriacBlock,
  ThaanaBlock, DevanagariBlock, BengaliBlock, GurmukhiBlock,
  GujaratiBlock, OriyaBlock, TamilBlock, TeluguBlock,
  KannadaBlock, MalayalamBlock, SinhalaBlock, ThaiBlock,
  LaoBlock, TibetanBlock, MyanmarBlock, GeorgianBlock,
  HangulJamoBlock, EthiopicBlock, CherokeeBlock, UnifiedCanadianAboriginalSyllabicsBlock,
  OghamBlock, RunicBlock, TagalogBlock, HanunooBlock,
  BuhidBlock, TagbanwaBlock, KhmerBlock, MongolianBlock,
  LatinExtendedAdditionalBlock, GreekExtendedBlock, GeneralPunctuationBlock, SuperscriptsAndSubscriptsBlock,
  CurrencySymbolsBlock, CombiningDiacriticalMarksForSymbolsBlock, LetterlikeSymbolsBlock, NumberFormsBlock,
  ArrowsBlock, MathematicalOperatorsBlock, MiscellaneousTechnicalBlock, ControlPicturesBlock,
  OpticalCharacterRecognitionBlock, EnclosedAlphanumericsBlock, BoxDrawingBlock, BlockElementsBlock,
  GeometricShapesBlock, MiscellaneousSymbolsBlock, DingbatsBlock, MiscellaneousMathematicalSymbolsABlock,
  SupplementalArrowsABlock, BraillePatternsBlock, SupplementalArrowsBBlock, MiscellaneousMathematicalSymbolsBBlock,
  SupplementalMathematicalOperatorsBlock, CjkRadicalsSupplementBlock, KangxiRadicalsBlock, IdeographicDescriptionCharactersBlock,
  CjkSymbolsAndPunctuationBlock, HiraganaBlock, KatakanaBlock, BopomofoBlock,
  HangulCompatibilityJamoBlock, KanbunBlock, BopomofoExtendedBlock, KatakanaPhoneticExtensionsBlock,
  EnclosedCjkLettersAndMonthsBlock, CjkCompatibilityBlock, CjkUnifiedIdeographsExtensionABlock, CjkUnifiedIdeographsBlock,
  YiSyllablesBlock, YiRadicalsBlock, HangulSyllablesBlock, HighSurrogatesBlock,
  HighPrivateUseSurrogatesBlock, LowSurrogatesBlock, PrivateUseAreaBlock, CjkCompatibilityIdeographsBlock,
  AlphabeticPresentationFormsBlock, ArabicPresentationFormsABlock, VariationSelectorsBlock, CombiningHalfMarksBlock,
  CjkCompatibilityFormsBlock, SmallFormVariantsBlock, ArabicPresentationFormsBBlock, HalfwidthAndFullwidthFormsBlock,
  SpecialsBlock, OldItalicBlock, GothicBlock, DeseretBlock,
  ByzantineMusicalSymbolsBlock, MusicalSymbolsBlock, MathematicalAlphanumericSymbolsBlock, CjkUnifiedIdeographsExtensionBBlock,
  CjkCompatibilityIdeographsSupplementBlock, TagsBlock, PrivateUseBlock, SupplementaryPrivateUseAreaABlock,
  SupplementaryPrivateUseAreaBBlock, InvalidBlock, EndBlock
}
 
enum  CombiningClass {
  BeginCombiningClass, BaseEquivalent, Spacing, Nonspacing,
  Split, Enclosing, Reordrant, TibetanSubjoined,
  OverlayInterior, Nuktas, KanaVoicingMarks, Viramas,
  StartOfFixedPositionClasses, HebrewPointSheva, HebrewPointHatafSegol, HebrewPointHatafPatah,
  HebrewPointHatafQamats, HebrewPointHiriq, HebrewPointTsere, HebrewPointSegol,
  HebrewPointPatah, HebrewPointQamats, HebrewPointHolam, HebrewPointQubuts,
  HebrewPointDagishOrMapiq, HebrewPointMeteg, HebrewPointRafe, HebrewPointShinDot,
  HebrewPointSinDot, HebrewPointJudeoSpanishVarika, ArabicFathatan, ArabicDammatan,
  ArabicKasratan, ArabicFatha, ArabicDamma, ArabicKasra,
  ArabicShadda, ArabicSukun, ArabicLetterSuperscriptAlef, SyriacLetterSuperscriptAlaph,
  TeluguLengthMark, TeluguAiLengthMark, ThaiCharacterSaraU, ThaiCharacterSaraUu,
  ThaiCharacterMaiEk, ThaiCharacterMaiTho, ThaiCharacterMaiTri, ThaiCharacterMaiChattawa,
  LaoVowelSignU, LaoVowelSignUu, LaoToneMaiEk, LaoToneMaiTho,
  LaoToneMaiTi, LaoToneMaiCatawa, TibetanVowelSignAa, TibetanVowelSignI,
  TibetanVowelSignE, TibetanVowelSignEe, TibetanVowelSignO, TibetanVowelSignOo,
  TibetanVowelSignReversedI, TibetanVowelSignU, EndOfFixedPositionClasses, BelowLeftAttached,
  BelowAttached, BelowRightAttached, LeftAttached, RightAttached,
  AboveLeftAttached, AboveAttached, AboveRightAttached, BelowLeft,
  Below, BelowRight, Left, Right,
  AboveLeft, Above, AboveRight, DoubleBelow,
  DoubleAbove, IotaSubscript, EndCombiningClass, NoCombiningClass
}
 
enum  EastAsianWidth {
  NeutralWidth, AmbiguousWidth, HalfWidth, FullWidth,
  NarrowWidth, WideWidth
}
 
enum  GeneralCategory {
  BeginGeneralCategory, Unassigned, UppercaseLetter, LowercaseLetter,
  TitlecaseLetter, ModifierLetter, OtherLetter, NonSpacingMark,
  EnclosingMark, CombiningSpacingMark, DecimalDigitNumber, LetterNumber,
  OtherNumber, SpaceSeparator, LineSeparator, ParagraphSeparator,
  ControlChar, FormatChar, PrivateUseChar, Surrogate,
  DashPunctuation, StartPunctuation, EndPunctuation, ConnectorPunctuation,
  OtherPunctuation, MathSymbol, CurrencySymbol, ModifierSymbol,
  OtherSymbol, InitialPunctuation, FinalPunctuation, GeneralOtherTypes,
  EndGeneralCategory
}
 
enum  Script {
  BeginScript, Latin, Greek, Cyrillic,
  Armenian, Hebrew, Arabic, Syriac,
  Thaana, Devanagari, Bengali, Gurmukhi,
  Gujarati, Oriya, Tamil, Telugu,
  Kannada, Malayalam, Sinhala, Thai,
  Lao, Tibetan, Myanmar, Georgian,
  Hangul, Ethiopic, Cherokee, CanadianAboriginal,
  Ogham, Runic, Khmer, Mongolian,
  Hiragana, Katakana, Bopomofo, Han,
  Yi, OldItalic, Gothic, Deseret,
  Inherited, NoScript, InvalidScript, EndScript
}
 

Static Public Member Functions

static BidirectionalCategory getBidirectionalCategory (RWUChar32 cp)
 
static Block getBlock (RWUChar32 cp)
 
static RWUChar32 getChar32 (const char *name, bool isDeprecatedName=false)
 
static CombiningClass getCombiningClass (RWUChar32 cp)
 
static int32_t getDecimalValue (RWUChar32 cp)
 
static RWUChar32 getDigit (int32_t value, int8_t radix)
 
static EastAsianWidth getEastAsianWidth (RWUChar32 cp)
 
static GeneralCategory getGeneralCategory (RWUChar32 cp)
 
static RWUChar32 getMirror (RWUChar32 cp)
 
static RWCString getName (RWUChar32 cp, bool getDeprecatedName=false)
 
static int32_t getNumericValue (RWUChar32 cp, int8_t radix)
 
static Script getScript (RWUChar32 cp)
 
static const RWUChar32 * getWhitespace ()
 
static bool isCharacter (RWUChar32 cp)
 
static bool isControl (RWUChar32 cp)
 
static bool isDecimalDigit (RWUChar32 cp)
 
static bool isDefined (RWUChar32 cp)
 
static bool isDigit (RWUChar32 cp)
 
static bool isError (RWUChar32 cp)
 
static bool isHighSurrogate (RWUChar16 cu)
 
static bool isLetter (RWUChar32 cp)
 
static bool isLower (RWUChar32 cp)
 
static bool isLowSurrogate (RWUChar16 cu)
 
static bool isMirrored (RWUChar32 cp)
 
static bool isNumeric (RWUChar32 cp)
 
static bool isPunctuation (RWUChar32 cp)
 
static bool isSingle (RWUChar16 cu)
 
static bool isSpace (RWUChar32 cp)
 
static bool isSurrogate (RWUChar16 cu)
 
static bool isTitle (RWUChar32 cp)
 
static bool isUpper (RWUChar32 cp)
 
static bool isWhitespace (RWUChar32 cp)
 
static bool requiresSurrogatePair (RWUChar32 cp)
 
static RWUChar32 toLower (RWUChar32 cp)
 
static RWUChar32 toTitle (RWUChar32 cp)
 
static RWUChar32 toUpper (RWUChar32 cp)
 

Detailed Description

RWUCharTraits provides methods for querying the properties of Unicode characters.

The Unicode Standard defines a comprehensive set of properties for each code point in the Unicode character set. The set of properties and the values of those properties are specified by the Unicode Character Database:

http://www.unicode.org/onlinedat/online.html

that is published as part of the Unicode Standard:

http://www.unicode.org/standard/standard.html

Unicode character properties may be either normative or informative, as defined in Chapter 3, "Conformance", of the Unicode Standard.

RWUCharTraits provides access to both normative and informative properties of Unicode characters.

Member Enumeration Documentation

An enumeration used to describe the linguistic direction of a Unicode character code point. The values in this enumeration correspond to the bidirectional category property codes that appear in the file UnicodeData.txt of the Unicode Character Database, as described in

http://www.unicode.org/reports/tr44/

The bidirectional property is a normative property of the Unicode Standard.

See also
getBidirectionalCategory()
Enumerator
NoBidirectionalCategory 

 

BeginBidirectionalCategory 

 

LeftToRight 

L

RightToLeft 

R

EuropeanNumber 

EN

EuropeanNumberSeparator 

ES

EuropeanNumberTerminator 

ET

ArabicNumber 

AN

CommonNumberSeparator 

CS

BlockSeparator 

B

SegmentSeparator 

S

WhiteSpaceNeutral 

WS

OtherNeutral 

ON

LeftToRightEmbedding 

LRE

LeftToRightOverride 

LRO

RightToLeftArabic 

AL

RightToLeftEmbedding 

RLE

RightToLeftOverride 

RLO

PopDirectionalFormat 

PDF

DirNonSpacingMark 

NSM

BoundaryNeutral 

BN

EndBidirectionalCategory 

 

An enumeration used to identify the various Unicode character blocks. The values in this enumeration correspond to the block names that appear in the file Blocks.txt of the Unicode Character Database, as described in Chapter 14, "Code Charts", of the Unicode Standard:

http://www.unicode.org/standard/standard.html

The block property is an informative property of the Unicode Standard.

See also
getBlock(), getScript()
Enumerator
NoBlock 

 

BeginBlock 

 

BasicLatinBlock 

 

Latin1SupplementBlock 

 

LatinExtendedABlock 

 

LatinExtendedBBlock 

 

IpaExtensionsBlock 

 

SpacingModifierLettersBlock 

 

CombiningDiacriticalMarksBlock 

 

GreekAndCopticBlock 

 

CyrillicBlock 

 

CyrillicSupplementalblock 

Defined under Unicode 3.2

ArmenianBlock 

 

HebrewBlock 

 

ArabicBlock 

 

SyriacBlock 

 

ThaanaBlock 

 

DevanagariBlock 

 

BengaliBlock 

 

GurmukhiBlock 

 

GujaratiBlock 

 

OriyaBlock 

 

TamilBlock 

 

TeluguBlock 

 

KannadaBlock 

 

MalayalamBlock 

 

SinhalaBlock 

 

ThaiBlock 

 

LaoBlock 

 

TibetanBlock 

 

MyanmarBlock 

 

GeorgianBlock 

 

HangulJamoBlock 

 

EthiopicBlock 

 

CherokeeBlock 

 

UnifiedCanadianAboriginalSyllabicsBlock 

 

OghamBlock 

 

RunicBlock 

 

TagalogBlock 

 

HanunooBlock 

 

BuhidBlock 

 

TagbanwaBlock 

 

KhmerBlock 

 

MongolianBlock 

 

LatinExtendedAdditionalBlock 

 

GreekExtendedBlock 

 

GeneralPunctuationBlock 

 

SuperscriptsAndSubscriptsBlock 

 

CurrencySymbolsBlock 

 

CombiningDiacriticalMarksForSymbolsBlock 

 

LetterlikeSymbolsBlock 

 

NumberFormsBlock 

 

ArrowsBlock 

 

MathematicalOperatorsBlock 

 

MiscellaneousTechnicalBlock 

 

ControlPicturesBlock 

 

OpticalCharacterRecognitionBlock 

 

EnclosedAlphanumericsBlock 

 

BoxDrawingBlock 

 

BlockElementsBlock 

 

GeometricShapesBlock 

 

MiscellaneousSymbolsBlock 

 

DingbatsBlock 

 

MiscellaneousMathematicalSymbolsABlock 

 

SupplementalArrowsABlock 

 

BraillePatternsBlock 

 

SupplementalArrowsBBlock 

 

MiscellaneousMathematicalSymbolsBBlock 

 

SupplementalMathematicalOperatorsBlock 

 

CjkRadicalsSupplementBlock 

 

KangxiRadicalsBlock 

 

IdeographicDescriptionCharactersBlock 

 

CjkSymbolsAndPunctuationBlock 

 

HiraganaBlock 

 

KatakanaBlock 

 

BopomofoBlock 

 

HangulCompatibilityJamoBlock 

 

KanbunBlock 

 

BopomofoExtendedBlock 

 

KatakanaPhoneticExtensionsBlock 

 

EnclosedCjkLettersAndMonthsBlock 

 

CjkCompatibilityBlock 

 

CjkUnifiedIdeographsExtensionABlock 

 

CjkUnifiedIdeographsBlock 

 

YiSyllablesBlock 

 

YiRadicalsBlock 

 

HangulSyllablesBlock 

 

HighSurrogatesBlock 

 

HighPrivateUseSurrogatesBlock 

 

LowSurrogatesBlock 

 

PrivateUseAreaBlock 

 

CjkCompatibilityIdeographsBlock 

 

AlphabeticPresentationFormsBlock 

 

ArabicPresentationFormsABlock 

 

VariationSelectorsBlock 

 

CombiningHalfMarksBlock 

 

CjkCompatibilityFormsBlock 

 

SmallFormVariantsBlock 

 

ArabicPresentationFormsBBlock 

 

HalfwidthAndFullwidthFormsBlock 

 

SpecialsBlock 

 

OldItalicBlock 

 

GothicBlock 

 

DeseretBlock 

 

ByzantineMusicalSymbolsBlock 

 

MusicalSymbolsBlock 

 

MathematicalAlphanumericSymbolsBlock 

 

CjkUnifiedIdeographsExtensionBBlock 

 

CjkCompatibilityIdeographsSupplementBlock 

 

TagsBlock 

 

PrivateUseBlock 

 

SupplementaryPrivateUseAreaABlock 

 

SupplementaryPrivateUseAreaBBlock 

 

InvalidBlock 

 

EndBlock 

 

An enumeration used to identify the combining class property to which each Unicode combining character belongs. See Chapter 3, "Conformance", and Chapter 4, "Character Properties", in the Unicode Standard:

http://www.unicode.org/standard/standard.html

The combining class property is a normative property of the Unicode Standard. The numeric value of each combining class and the number of classes are not normative and may change in future versions of the Unicode Standard.

Note
Some of the combining classes in this enumeration do not currently describe any Unicode characters but are specified here for completeness.
See also
getCombiningClass()
Enumerator
BeginCombiningClass 

 

BaseEquivalent 

 

Spacing 

 

Nonspacing 

 

Split 

 

Enclosing 

 

Reordrant 

 

TibetanSubjoined 

 

OverlayInterior 

 

Nuktas 

 

KanaVoicingMarks 

 

Viramas 

 

StartOfFixedPositionClasses 

 

HebrewPointSheva 

 

HebrewPointHatafSegol 

 

HebrewPointHatafPatah 

 

HebrewPointHatafQamats 

 

HebrewPointHiriq 

 

HebrewPointTsere 

 

HebrewPointSegol 

 

HebrewPointPatah 

 

HebrewPointQamats 

 

HebrewPointHolam 

 

HebrewPointQubuts 

 

HebrewPointDagishOrMapiq 

 

HebrewPointMeteg 

 

HebrewPointRafe 

 

HebrewPointShinDot 

 

HebrewPointSinDot 

 

HebrewPointJudeoSpanishVarika 

 

ArabicFathatan 

 

ArabicDammatan 

 

ArabicKasratan 

 

ArabicFatha 

 

ArabicDamma 

 

ArabicKasra 

 

ArabicShadda 

 

ArabicSukun 

 

ArabicLetterSuperscriptAlef 

 

SyriacLetterSuperscriptAlaph 

 

TeluguLengthMark 

 

TeluguAiLengthMark 

 

ThaiCharacterSaraU 

 

ThaiCharacterSaraUu 

 

ThaiCharacterMaiEk 

 

ThaiCharacterMaiTho 

 

ThaiCharacterMaiTri 

 

ThaiCharacterMaiChattawa 

 

LaoVowelSignU 

 

LaoVowelSignUu 

 

LaoToneMaiEk 

 

LaoToneMaiTho 

 

LaoToneMaiTi 

 

LaoToneMaiCatawa 

 

TibetanVowelSignAa 

 

TibetanVowelSignI 

 

TibetanVowelSignE 

 

TibetanVowelSignEe 

 

TibetanVowelSignO 

 

TibetanVowelSignOo 

 

TibetanVowelSignReversedI 

 

TibetanVowelSignU 

 

EndOfFixedPositionClasses 

 

BelowLeftAttached 

 

BelowAttached 

 

BelowRightAttached 

 

LeftAttached 

 

RightAttached 

 

AboveLeftAttached 

 

AboveAttached 

 

AboveRightAttached 

 

BelowLeft 

 

Below 

 

BelowRight 

 

Left 

 

Right 

 

AboveLeft 

 

Above 

 

AboveRight 

 

DoubleBelow 

 

DoubleAbove 

 

IotaSubscript 

 

EndCombiningClass 

 

NoCombiningClass 

 

An enumeration used to describe the display cell width of characters in East Asian text. The values in this enumeration correspond to the East Asian width property values defined in the EastAsianWidth.txt file of the Unicode Character Database (UCD), as described in Unicode Standard Annex #11, "East Asian Width":

http://www.unicode.org/reports/tr11

In accordance with changes to the Unicode specification for East Asian Width values, the ICU implementation changed between versions 2.4 and 2.6. Prior to ICU 2.6 (June 2003), the SourcePro Core Internationalization Module used the following declaration for EastAsianWidth:

Note
The Internationalization Module adopts the set of values that is supported by the underlying ICU implementation.
Enumerator
NeutralWidth 

A value used to identify characters that do not occur in legacy East Asian character sets. By extension, they also do not occur in East Asian typography.

AmbiguousWidth 

All characters that can be sometimes wide and sometimes narrow. Ambiguous characters occur in East Asian legacy character sets as wide characters, but as narrow characters in non-EastAsian usage. This set includes

  • Greek and Cyrillic alphabet symbols found in East Asian character sets
  • Some mathematical symbols
  • Private Use characters
HalfWidth 

All characters that are explicitly defined as HALF WIDTH in the Unicode Standard by having a compatibility decomposition of type <narrow> to characters elsewhere in the Unicode Standard that are implicitly wide but unmarked, plus the WON SIGN.

FullWidth 

All characters that are defined as FULL WIDTH in the Unicode Standard by having a compatibility decomposition of type <wide> to characters elsewhere in the Unicode Standard that are implicitly narrow but unmarked.

NarrowWidth 

All other characters that are always narrow and have explicit full-width or wide counterparts. These characters are implicitly narrow in East Asian typography and legacy character sets since they have explicit full-width or wide counterparts. All of US-ASCII is an example of East Asian Narrow characters.

WideWidth 

All other characters that are always wide. These characters occur only in the context of East Asian typography where they are wide characters (such as the Unified Han Ideographs or Squared Katakana Symbols). This category includes characters that have explicit half-width counterparts.

An enumeration of the general categories used to classify Unicode characters. The values in this enumeration correspond to the general category property codes that appear in the UnicodeData.txt file of the Unicode Character Database, as described in:

http://www.unicode.org/reports/tr44/

See also
getGeneralCategory()
Enumerator
BeginGeneralCategory 

 

Unassigned 

 

UppercaseLetter 

Lu (Normative)

LowercaseLetter 

Ll (Normative)

TitlecaseLetter 

Lt (Normative)

ModifierLetter 

Lm (Informative)

OtherLetter 

Lo (Informative)

NonSpacingMark 

Mn (Normative)

EnclosingMark 

Me (Normative)

CombiningSpacingMark 

Mc (Normative)

DecimalDigitNumber 

Nd (Normative)

LetterNumber 

Nl (Normative)

OtherNumber 

No (Normative)

SpaceSeparator 

Zs (Normative)

LineSeparator 

Zl (Normative)

ParagraphSeparator 

Zp (Normative)

ControlChar 

Cc (Normative)

FormatChar 

Cf (Normative)

PrivateUseChar 

Co (Normative)

Surrogate 

Cs (Normative)

DashPunctuation 

Pd (Informative)

StartPunctuation 

Ps (Informative)

EndPunctuation 

Pe (Informative)

ConnectorPunctuation 

Pc (Informative)

OtherPunctuation 

Po (Informative)

MathSymbol 

Sm (Informative)

CurrencySymbol 

Sc (Informative)

ModifierSymbol 

Sk (Informative)

OtherSymbol 

So (Informative)

InitialPunctuation 

Pi (Informative)

FinalPunctuation 

Pf (Informative)

GeneralOtherTypes 

Cn (Normative)

EndGeneralCategory 

 

An enumeration used to identify Unicode scripts. The values in this enumeration correspond to the script property names defined in the Scripts.txt file of the Unicode Character Database, as described in Unicode Technical Report #24, "Script Names":

http://www.unicode.org/reports/tr24

The script property is an informative property of the Unicode Standard.

See also
getScript()
Enumerator
BeginScript 

 

Latin 

 

Greek 

 

Cyrillic 

 

Armenian 

 

Hebrew 

 

Arabic 

 

Syriac 

 

Thaana 

 

Devanagari 

 

Bengali 

 

Gurmukhi 

 

Gujarati 

 

Oriya 

 

Tamil 

 

Telugu 

 

Kannada 

 

Malayalam 

 

Sinhala 

 

Thai 

 

Lao 

 

Tibetan 

 

Myanmar 

 

Georgian 

 

Hangul 

 

Ethiopic 

 

Cherokee 

 

CanadianAboriginal 

 

Ogham 

 

Runic 

 

Khmer 

 

Mongolian 

 

Hiragana 

 

Katakana 

 

Bopomofo 

 

Han 

 

Yi 

 

OldItalic 

 

Gothic 

 

Deseret 

 

Inherited 

 

NoScript 

 

InvalidScript 

 

EndScript 

 

Member Function Documentation

RWUCharTraits::BidirectionalCategory RWUCharTraits::getBidirectionalCategory ( RWUChar32  cp)
inlinestatic

Returns the value of the bidirectional category property for the Unicode character whose code point is equal to cp. A value of NoBidirectionalCategory is returned if cp is not a valid character. The directional type of a character is defined in the Unicode Character Database:

http://www.unicode.org/reports/tr44/

The bidirectional property is a normative property of the Unicode Standard.

RWUCharTraits::Block RWUCharTraits::getBlock ( RWUChar32  cp)
inlinestatic

Returns the value in the Block enumeration that identifies the character block which contains the Unicode character whose code point is equal to cp. A value of NoBlock is returned if cp is not a valid character. The character block is determined using information in the Blocks.txt file of the Unicode Character Database:

http://www.unicode.org/reports/tr44/

as described in Chapter 14, "Code Charts" of the Unicode Standard:

http://www.unicode.org/standard/standard.html

The block property is an informative property of the Unicode Standard.

RWUChar32 RWUCharTraits::getChar32 ( const char *  name,
bool  isDeprecatedName = false 
)
inlinestatic

Returns the code point of the Unicode character whose character name property value is equal to the value given for name. Returns a value of 0xFFFF if no character with that name exists.

Each character may have two different names; a deprecated name, as originally defined by Unicode 1.0, and a standard name, as defined in subsequent versions of the standard. These names are defined in the UnicodeData.txt file of the Unicode Character Database:

http://www.unicode.org/reports/tr44/

A value of true for isDeprecatedName indicates that name is a character name defined in Version 1.0 of the Unicode Standard; a value of false indicates that name is a character name defined in Version 2.0 and subsequent versions of the Unicode Standard. The default value is false.

The character name property is a normative property of the Unicode Standard.

Exceptions
RWUException Thrown if an error occurs.
RWUCharTraits::CombiningClass RWUCharTraits::getCombiningClass ( RWUChar32  cp)
inlinestatic

Returns the value of the combining class property for the Unicode character whose code point is cp. Each combining character is assigned a number that identifies the set containing all of the combining characters it typographically interacts with. See Chapter 3, "Conformance", of the Unicode Standard:

http://www.unicode.org/standard/standard.html

The combining class property is a normative property of the Unicode Standard. The numeric value of each combining class and the number of classes are not normative and may change in future versions of the Unicode Standard. The relative ordering of the classes is normative.

int32_t RWUCharTraits::getDecimalValue ( RWUChar32  cp)
inlinestatic

Returns the value of the decimal digit property for the Unicode character whose code point is equal to cp, if that character possesses the decimal digit property; otherwise, -1. The numeric value of a decimal digit is determined using information in the file UnicodeData.txt as described in:

http://www.unicode.org/reports/tr44/

See Chapter 3, "Conformance", and Chapter 4, "Character Properties", of the Unicode Standard:

http://www.unicode.org/standard/standard.html

The decimal digit value property is a normative property of the Unicode Standard.

RWUChar32 RWUCharTraits::getDigit ( int32_t  value,
int8_t  radix 
)
inlinestatic

Returns the RWUChar32 code point corresponding to the US-ASCII character in the range [0-9a-z]. Converts an integer value into a corresponding US-ASCII alphanumeric digit, given some radix in the range 2-36. value must be in the range 0 <= value < radix. For example, RWUCharTraits::getDigit(15, 16) returns the code point for f (U+0066).

In case of invalid value or invalid radix or the combination, returns null.

RWUCharTraits::EastAsianWidth RWUCharTraits::getEastAsianWidth ( RWUChar32  cp)
inlinestatic

Returns the value of the East Asian width property for the Unicode character whose code point is equal to cp. The display cell width of an East-Asian character is determined using information in the file EastAsianWidth.txt of the Unicode Character Database:

http://www.unicode.org/reports/tr44/

The Unicode Standard contains a complete set of precomposed modern Hangul syllables, as well as the set of conjoining jamo characters required to encode both modern and archaic syllable blocks. The Hangul jamo characters can be organized into the following classes:

  • choseong - the leading consonants or initial characters of a syllable.
  • jungseong - the vowels or peak characters of a syllable.
  • jongseong - the trailing consonants or final characters of a syllable.

When used with Korean Hangul, this function treats choseong jamo as double-width characters, and jungseong and jongseong jamo as zero-width characters. The result is correct only if the specified character is a precomposed choseong character and not part of a sequence of two choseong characters.

The East Asian width property is an informative property of the Unicode Standard. For more information, see Unicode Standard Annex #11, "East Asian Width":

http://www.unicode.org/reports/tr11/

and Chapter 10, "East Asian Scripts", in the Unicode Standard:

http://www.unicode.org/standard/standard.html

RWUCharTraits::GeneralCategory RWUCharTraits::getGeneralCategory ( RWUChar32  cp)
inlinestatic

Returns the value of the general category property for the Unicode character whose code point is cp. Character categories are assigned in the file UnicodeData.txt of the Unicode Character Database:

http://www.unicode.org/reports/tr44/

The general category property is a normative property of the Unicode Standard. For more information, see Chapter 4 "Character Properties" in the Unicode Standard:

http://www.unicode.org/standard/standard.html

RWUChar32 RWUCharTraits::getMirror ( RWUChar32  cp)
inlinestatic

Returns the code point of the character that provides a "mirror-image" of the Unicode character whose code point is cp, as defined by the Unicode Standard, or the value of cp, if no such character exists. The mirror for cp is determined using information in the file BidiMirroring.txt of the Unicode Character Database:

http://www.unicode.org/reports/tr44/

as described in Unicode Standard Annex #9, "The Bidirectional Algorithm":

http://www.unicode.org/reports/tr9/

For more information, see Chapter 4 "Character Properties" in the Unicode Standard:

http://www.unicode.org/standard/standard.html

See also
isMirrored()
RWCString RWUCharTraits::getName ( RWUChar32  cp,
bool  getDeprecatedName = false 
)
inlinestatic

Returns the value of the character name property for the Unicode character whose code point is cp.

The name property is a normative property of the Unicode Standard.

int32_t RWUCharTraits::getNumericValue ( RWUChar32  cp,
int8_t  radix 
)
inlinestatic

Converts the code point cp of a digit character in the specified radix into an equivalent integer value.

Returns the numeric value of the character cp as follows:

  • If radix is not in the range 2-36, or if the value of cp is not a valid digit in that radix, then -1 is returned.
  • If the character cp is listed as a digit in the UnicodeData.txt file of the Unicode Character Database, and the decimal value given for that digit is less than the specified radix, then the decimal value of that digit is returned.
  • If the character cp is one of the uppercase Latin letters in the range A-Z and its code point value is less than (radix + 'A' - 10), then the value of the expression (cp - 'A' + 10) is returned.
  • If the character cp is one of the lowercase Latin letters in the range a-z and its code point value is less than (radix + 'a' - 10), then the value of the expression (cp - 'a' + 10) is returned.
static Script RWUCharTraits::getScript ( RWUChar32  cp)
static

Returns the value of the script property of the Unicode character whose code point is cp.

The script property is an informative property of the Unicode Standard.

const RWUChar32 * RWUCharTraits::getWhitespace ( )
inlinestatic

Returns a null-terminated array of code points that are reasonably considered to be a word break or whitespace. The list of such code points is provided only as a convenience for use as delimiters (see RWUTokenizer).

Whitespace is not a property of the Unicode standard.

See also
isWhitespace()
bool RWUCharTraits::isCharacter ( RWUChar32  cp)
inlinestatic

Returns true if cp is a valid Unicode character code point; otherwise, false. The range of Unicode code points is 0x0 - 0x10FFFF; however, some values within this range are reserved and are not valid characters.

The character property is a normative property of the Unicode Standard.

bool RWUCharTraits::isControl ( RWUChar32  cp)
inlinestatic

Returns true if cp is a code point for a control character; otherwise, false. The set of control characters is:

  • ISO 8-bit control characters (U+0000..U+001f and U+007f..U+009f)
  • Characters assigned to the Cc (Other, control) category
  • Characters assigned to the Cf (Other, format) category
  • Characters assigned to the Zl (Separator, line) category
  • Characters assigned to the Zp (Separator, paragraph) category

This set is defined in the UnicodeData.txt file of the Unicode Character Database, as described in:

http://www.unicode.org/reports/tr44/

The control property is a normative property of the Unicode Standard.

bool RWUCharTraits::isDecimalDigit ( RWUChar32  cp)
inlinestatic

Returns true if cp is a code point for a decimal digit; otherwise, false. Decimal digits are identified using information in the file

http://www.unicode.org/Public/UNIDATA/UnicodeData.txt

according to the method described in

http://www.unicode.org/reports/tr44/

The decimal digit property is a normative property of the Unicode Standard.

bool RWUCharTraits::isDefined ( RWUChar32  cp)
inlinestatic

Returns true if cp is the code point for a character named in the Unicode Character Database; otherwise, false. A defined character is assigned various properties under the Unicode Standard. These properties can be accessed using the methods provided by this class.

The defined property is a normative property of the Unicode Standard.

bool RWUCharTraits::isDigit ( RWUChar32  cp)
inlinestatic

Returns true if cp is a code point for a digit; otherwise, false. Digits are identified using information in the file UnicodeData.txt of the Unicode Character Database:

http://www.unicode.org/reports/tr44/

The digit property is a normative property of the Unicode Standard.

See also
isDecimalDigit(), isNumeric()
bool RWUCharTraits::isError ( RWUChar32  cp)
inlinestatic

Returns true if cp is an error code and not a valid character code point; otherwise, false. Some RWUChar32 values have been reserved for use in reporting errors.

The error property is a private, implementation-defined property allowed by the Unicode Standard.

bool RWUCharTraits::isHighSurrogate ( RWUChar16  cu)
inlinestatic

Returns true if cu is the first, or high, code unit of a surrogate pair; otherwise, false. A high surrogate has a value in the range [U+D800..U+DBFF].

The surrogate property is a normative property of the Unicode Standard.

See also
isSurrogate(), isLowSurrogate(), isSingle()
bool RWUCharTraits::isLetter ( RWUChar32  cp)
inlinestatic

Returns true if cp is a code point for a letter; otherwise, false. The value of cp is the code point for a letter if it:

  • refers to a valid character in the file UnicodeData.txt of the Unicode Character Database, and the general category property for that character is one of Ll, Lu, Lt, Lm, or Lo, as described in
    http://www.unicode.org/reports/tr44/
  • is listed in the "Other_Alphabetic" section of the PropList.txt file, as described in
    http://www.unicode.org/reports/tr44/

The letter property is an informative property of the Unicode Standard.

See also
getGeneralCategory(), isDigit()
bool RWUCharTraits::isLower ( RWUChar32  cp)
inlinestatic

Returns true if cp is a code point for a lowercase letter; otherwise, false. The value is the code point for a lowercase letter if it refers to a valid character in the file UnicodeData.txt of the Unicode Character Database, and the general category property for that character is Ll, as described in:

http://www.unicode.org/reports/tr44/

The lowercase property is a normative property of the Unicode Standard.

See also
getGeneralCategory(), isUpper(), isTitle()
bool RWUCharTraits::isLowSurrogate ( RWUChar16  cu)
inlinestatic

Returns true if cu is the second, or low, code unit of a surrogate pair; otherwise, false. A low surrogate has a value in the range [U+DC00..U+DFFF].

The surrogate property is a normative property of the Unicode Standard.

bool RWUCharTraits::isMirrored ( RWUChar32  cp)
inlinestatic

Returns true if cp is a code point for a mirrored character; otherwise, false. Mirroring is a property of characters, such as parentheses, whose images are reflected horizontally in text that is laid out right to left. For example, the left parenthesis is the opening parenthesis in left-to-right text, but in right-to-left text the mirrored right parenthesis is the opening parenthesis. The Unicode mirrored property is described in Unicode Standard Annex #9, "The Bidirectional Algorithm":

http://www.unicode.org/reports/tr9/

The mirrored property is a normative property of the Unicode Standard.

bool RWUCharTraits::isNumeric ( RWUChar32  cp)
inlinestatic

Returns true if cp is a code point for a numeric character; otherwise, false. The set of numeric characters is a superset of the set of digit characters, and includes additional characters such as vulgar fractions and Roman numerals.

Numeric characters are identified using information in the UnicodeData.txt file of the Unicode Character Database:

http://www.unicode.org/reports/tr44/

The numeric property is a normative property of the Unicode Standard.

bool RWUCharTraits::isPunctuation ( RWUChar32  cp)
inlinestatic

Returns true if cp is a code point for a punctuation character; otherwise, false.

Punctuation characters are identified using information in the UnicodeData.txt file of the Unicode Character Database:

http://www.unicode.org/reports/tr44/

The punctuation property is a normative property of the Unicode Standard.

bool RWUCharTraits::isSingle ( RWUChar16  cu)
inlinestatic

Returns true if cu is a code unit that corresponds to a single code point, or false if the value is one half of a surrogate pair, which consists of two code units.

The surrogate property is a normative property of the Unicode Standard.

See also
isSurrogate(), isHighSurrogate(), isLowSurrogate()
bool RWUCharTraits::isSpace ( RWUChar32  cp)
inlinestatic

Returns true if cp is a code point for a spacing character; otherwise, false. Space characters are identified in the Unicode Character Database:

http://www.unicode.org/reports/tr44/

The space property is an informative property of the Unicode Standard.

bool RWUCharTraits::isSurrogate ( RWUChar16  cu)
inlinestatic

Returns true if cu is a surrogate in the range U+D800 through U+DFFF; otherwise, false.

Surrogates are not characters; they are reserved for use in UTF-16 strings as leading and trailing code units of multi-unit sequences used to encode single code points.

The surrogate property is a normative property of the Unicode Standard.

See also
isSingle(), isHighSurrogate(), isLowSurrogate()
bool RWUCharTraits::isTitle ( RWUChar32  cp)
inlinestatic

Returns true if cp is a code point for a titlecase letter; otherwise, false. The value is the code point for a titlecase letter if it refers to a valid character in the file UnicodeData.txt of the Unicode Character Database, and the general category property for that character is Lu or Lt, as described in:

http://www.unicode.org/reports/tr44/

The titlecase property is a normative property of the Unicode Standard.

bool RWUCharTraits::isUpper ( RWUChar32  cp)
inlinestatic

Returns true if cp is a code point for an uppercase letter; otherwise, false. The value is a code point for an uppercase letter if it either:

  • refers to a valid character in the file UnicodeData.txt of the Unicode Character Database, and the general category property for that character is Lu, as described in
    http://www.unicode.org/reports/tr44/
  • is listed in the "Other_Uppercase" section of the PropList.txt file, as described in
    http://www.unicode.org/reports/tr44/
The uppercase property is a normative property of the Unicode Standard.
bool RWUCharTraits::isWhitespace ( RWUChar32  cp)
inlinestatic

Returns true if cp is recognized as whitespace.

Whitespace is not a property of the Unicode Standard.

See also
getWhitespace()
bool RWUCharTraits::requiresSurrogatePair ( RWUChar32  cp)
inlinestatic

Returns true if cp is greater than 0xFFFF and therefore would require a surrogate representation; otherwise, false.

Surrogates are a normative requirement of the Unicode Standard.

Note
For efficiency, this method does not check whether the given code point is valid.
RWUChar32 RWUCharTraits::toLower ( RWUChar32  cp)
inlinestatic

Returns the lowercase equivalent for the given character code point cp. If the code point has no lowercase equivalent, the code point itself is returned. The character is mapped to its lowercase equivalent using information in the Unicode Character Database, as described in:

http://www.unicode.org/reports/tr44/

This method can only handle simple case conversions. For locale-specific case conversions, use RWUString::toLower().

The case property is a normative property of the Unicode Standard. The lowercase mapping is an informative property of the Unicode Standard.

This method is equivalent to the ANSI/ISO C tolower() function.

RWUChar32 RWUCharTraits::toTitle ( RWUChar32  cp)
inlinestatic

Returns the titlecase equivalent for cp. The titlecase mapping of a character is determined using information in the file UnicodeData.txt of the Unicode Character Database, as described in:

http://www.unicode.org/reports/tr44/

If the character has no titlecase equivalent, the character itself is returned.

The case property is a normative property of the Unicode Standard. A titlecase mapping is an informative property of the Unicode Standard.

Note
This method can only map characters that map to a single character. This method cannot be used for mappings where the source character maps to more than one character, or where the mapping is context-dependent. The single character mappings provided by this method are insufficient for languages such as German. For full case mappings, use RWUString::toTitle().
RWUChar32 RWUCharTraits::toUpper ( RWUChar32  cp)
inlinestatic

Returns the uppercase equivalent for cp. The character is mapped to its uppercase equivalent using information in the Unicode Character Database, as described in:

http://www.unicode.org/reports/tr44/

If the code point has no uppercase equivalent, the code point itself is returned.

This method can only perform general letter case conversion. For language-specific case conversions, use RWUString::toUpper().

The case property is a normative property of the Unicode Standard. An uppercase mapping is an informative property of the Unicode Standard.

This method is equivalent to the ANSI/ISO C toupper() function.

Copyright © 2022 Rogue Wave Software, Inc., a Perforce company. All Rights Reserved.