Package org.jcodings
Class Encoding
- java.lang.Object
-
- org.jcodings.Encoding
-
- All Implemented Interfaces:
java.lang.Cloneable
- Direct Known Subclasses:
AbstractEncoding
public abstract class Encoding extends java.lang.Object implements java.lang.Cloneable
-
-
Field Summary
Fields Modifier and Type Field Description static int
CHAR_INVALID
private java.nio.charset.Charset
charset
private static int
count
private int
hashCode
private int
index
private boolean
isAsciiCompatible
private boolean
isDummy
private boolean
isFixedWidth
private boolean
isSingleByte
protected boolean
isUnicode
protected boolean
isUTF8
protected int
maxLength
protected int
minLength
private byte[]
name
static byte
NEW_LINE
private java.lang.String
stringName
-
Constructor Summary
Constructors Modifier Constructor Description protected
Encoding(java.lang.String name, int minLength, int maxLength)
-
Method Summary
All Methods Static Methods Instance Methods Abstract Methods Concrete Methods Deprecated Methods Modifier and Type Method Description abstract void
applyAllCaseFold(int flag, ApplyAllCaseFoldFunction fun, java.lang.Object arg)
Expand case folds given a character class (used for case insensitive matching)
static byte
asciiToLower(int c)
static byte
asciiToUpper(int c)
abstract CaseFoldCodeItem[]
caseFoldCodesByString(int flag, byte[] bytes, int p, int end)
Expand AST string nodes into their folded alternatives (look at: Analyser.expandCaseFoldString). Oniguruma equivalent: get_case_fold_codes_by_str
abstract int
caseMap(IntHolder flagP, byte[] bytes, IntHolder pp, int end, byte[] to, int toP, int toEnd)
Oniguruma equivalent:case_map
abstract int
codeToMbc(int code, byte[] bytes, int p)
Extracts code point into its multibyte representation
abstract int
codeToMbcLength(int code)
Returns character length given a code point Oniguruma equivalent:code_to_mbclen
abstract int[]
ctypeCodeRange(int ctype, IntHolder sbOut)
Returns code range for a given character type Oniguruma equivalent:get_ctype_code_range
static int
digitVal(int code)
boolean
equals(java.lang.Object other)
java.nio.charset.Charset
getCharset()
If this encoding is capable of being represented by a Java Charset then provide it.
java.lang.String
getCharsetName()
The name of the equivalent Java Charset for this encoding.
int
getIndex()
byte[]
getName()
int
hashCode()
boolean
isAlnum(int code)
boolean
isAlpha(int code)
static boolean
isAscii(byte b)
static boolean
isAscii(int code)
boolean
isAsciiCompatible()
boolean
isBlank(int code)
boolean
isCntrl(int code)
abstract boolean
isCodeCType(int code, int ctype)
Perform a check whether given code is of given character type (e.g. used by isWord(someByte) and similar methods)
boolean
isDigit(int code)
boolean
isDummy()
boolean
isFixedWidth()
boolean
isGraph(int code)
boolean
isLower(int code)
static boolean
isMbcAscii(byte b)
boolean
isMbcCrnl(byte[] bytes, int p, int end)
boolean
isMbcHead(byte[] bytes, int p, int end)
boolean
isMbcWord(byte[] bytes, int p, int end)
abstract boolean
isNewLine(byte[] bytes, int p, int end)
Returns true if bytes[p] is a head of a new line character. Oniguruma equivalent: is_mbc_newline
boolean
isNewLine(int code)
boolean
isPrint(int code)
boolean
isPunct(int code)
abstract boolean
isReverseMatchAllowed(byte[] bytes, int p, int end)
Returns true if it's safe to use reversal Boyer-Moore search fail fast algorithm Oniguruma equivalent:is_allowed_reverse_match
boolean
isSbWord(int code)
boolean
isSingleByte()
boolean
isSpace(int code)
boolean
isUnicode()
boolean
isUpper(int code)
boolean
isUTF8()
boolean
isWord(int code)
static boolean
isWordGraphPrint(int ctype)
boolean
isXDigit(int code)
abstract int
leftAdjustCharHead(byte[] bytes, int p, int s, int end)
Seeks the previous character head in a stream Oniguruma equivalent:left_adjust_char_head
abstract int
length(byte c)
Returns character length given character head: returns 1 for singlebyte encodings or performs direct length table lookup for multibyte ones.
abstract int
length(byte[] bytes, int p, int end)
Returns character length given stream, character position and stream end: returns 1 for singlebyte encodings, or performs sanity validations for multibyte ones and returns the character length, or the missing characters in the stream otherwise.
static Encoding
load(java.lang.String name)
static Encoding
load(java.lang.String name, java.lang.String pkg)
int
maxLength()
Returns maximum character byte length that can appear in an encoding Oniguruma equivalent:max_enc_len
int
maxLengthDistance()
Deprecated.
abstract int
mbcCaseFold(int flag, byte[] bytes, IntHolder pp, int end, byte[] to)
Performs case folding for a character at bytes[pp.value]
int
mbcodeStartPosition()
Deprecated.
abstract int
mbcToCode(byte[] bytes, int p, int end)
Returns code point for a character Oniguruma equivalent:mbc_to_code
int
minLength()
Returns minimum character byte length that can appear in an encoding Oniguruma equivalent:min_enc_len
static int
odigitVal(int code)
int
prevCharHead(byte[] bytes, int p, int s, int end)
abstract int
propertyNameToCType(byte[] bytes, int p, int end)
Returns character type given character type name (used when e.g. \p{Alpha})
(package private) Encoding
replicate(byte[] name)
int
rightAdjustCharHead(byte[] bytes, int p, int s, int end)
int
rightAdjustCharHeadWithPrev(byte[] bytes, int p, int s, int end, IntHolder prev)
protected void
setDummy()
protected void
setName(byte[] name)
protected void
setName(java.lang.String name)
int
step(byte[] bytes, int p, int end, int n)
int
stepBack(byte[] bytes, int p, int s, int end, int n)
int
strByteLengthNull(byte[] bytes, int p, int end)
abstract int
strCodeAt(byte[] bytes, int p, int end, int index)
abstract int
strLength(byte[] bytes, int p, int end)
int
strLengthNull(byte[] bytes, int p, int end)
int
strNCmp(byte[] bytes, int p, int end, byte[] ascii, int asciiP, int n)
byte[]
toLowerCaseTable()
Returns lower case table if it's safe to use it directly, otherwise null. Used for fast case insensitive matching for some singlebyte encodings.
java.lang.String
toString()
int
xdigitVal(int code)
-
-
-
Field Detail
-
CHAR_INVALID
public static final int CHAR_INVALID
- See Also:
- Constant Field Values
-
count
private static int count
-
minLength
protected final int minLength
-
maxLength
protected final int maxLength
-
isFixedWidth
private final boolean isFixedWidth
-
isSingleByte
private final boolean isSingleByte
-
isAsciiCompatible
private boolean isAsciiCompatible
-
isUnicode
protected boolean isUnicode
-
isUTF8
protected boolean isUTF8
-
name
private byte[] name
-
hashCode
private int hashCode
-
index
private int index
-
charset
private java.nio.charset.Charset charset
-
isDummy
private boolean isDummy
-
stringName
private java.lang.String stringName
-
NEW_LINE
public static final byte NEW_LINE
- See Also:
- Constant Field Values
-
-
Method Detail
-
setName
protected final void setName(java.lang.String name)
-
setName
protected final void setName(byte[] name)
-
setDummy
protected final void setDummy()
-
toString
public final java.lang.String toString()
- Overrides:
toString
in classjava.lang.Object
-
equals
public final boolean equals(java.lang.Object other)
- Overrides:
equals
in classjava.lang.Object
-
hashCode
public final int hashCode()
- Overrides:
hashCode
in classjava.lang.Object
-
getIndex
public final int getIndex()
-
getName
public final byte[] getName()
-
isDummy
public final boolean isDummy()
-
isAsciiCompatible
public final boolean isAsciiCompatible()
-
isUnicode
public final boolean isUnicode()
-
isUTF8
public final boolean isUTF8()
-
getCharset
public java.nio.charset.Charset getCharset()
If this encoding is capable of being represented by a Java Charset then provide it. Otherwise this will raise a CharsetNotFound error via the JDK APIs. To reduce cases like jruby/jruby#4716, we always attempt to find a charset here, and default to using the encoding name, which is never null. Either the encoding will exist in the JDK or it will fail hard, rather than propagating a null Charset. Encodings with names different from those found in the JDK can override getCharsetName to provide that name, or getCharset to return the right Charset.
-
getCharsetName
public java.lang.String getCharsetName()
The name of the equivalent Java Charset for this encoding. Defaults to the name of the encoding. Subclasses can override this to provide a different name.
- Returns:
- the name of the equivalent Java Charset for this encoding
-
replicate
Encoding replicate(byte[] name)
-
length
public abstract int length(byte c)
Returns character length given character head: returns 1 for singlebyte encodings or performs direct length table lookup for multibyte ones.
- Parameters:
c - Character head
Oniguruma equivalent: mbc_enc_len
To be deprecated very soon (use the length(byte[] bytes, int p, int end) version)
-
length
public abstract int length(byte[] bytes, int p, int end)
Returns character length given stream, character position and stream end: returns 1 for singlebyte encodings, or performs sanity validations for multibyte ones and returns the character length, or the missing characters in the stream otherwise.
- Returns:
- 0 Never
> 0 Valid character, length returned
-1 Illegal/malformed character
< -1 (-1 - n) Number of missing bytes for character in p...end range
Oniguruma equivalent: mbc_enc_len (modified for 1.9 purposes)
-
maxLength
public final int maxLength()
Returns maximum character byte length that can appear in an encoding Oniguruma equivalent:max_enc_len
-
maxLengthDistance
@Deprecated public final int maxLengthDistance()
Deprecated.
-
minLength
public final int minLength()
Returns minimum character byte length that can appear in an encoding Oniguruma equivalent:min_enc_len
-
isNewLine
public abstract boolean isNewLine(byte[] bytes, int p, int end)
Returns true if bytes[p] is a head of a new line character. Oniguruma equivalent: is_mbc_newline
-
mbcToCode
public abstract int mbcToCode(byte[] bytes, int p, int end)
Returns code point for a character Oniguruma equivalent:mbc_to_code
-
codeToMbcLength
public abstract int codeToMbcLength(int code)
Returns character length given a code point Oniguruma equivalent:code_to_mbclen
-
codeToMbc
public abstract int codeToMbc(int code, byte[] bytes, int p)
Extracts code point into its multibyte representation
- Returns:
- character length for the given code point
Oniguruma equivalent: code_to_mbc
-
mbcCaseFold
public abstract int mbcCaseFold(int flag, byte[] bytes, IntHolder pp, int end, byte[] to)
Performs case folding for a character at bytes[pp.value]
- Parameters:
flag - case fold flag
pp - an IntHolder that points at character head
to - a buffer where to extract case folded character
Oniguruma equivalent: mbc_case_fold
-
toLowerCaseTable
public byte[] toLowerCaseTable()
Returns lower case table if it's safe to use it directly, otherwise null. Used for fast case insensitive matching for some singlebyte encodings.
- Returns:
- lower case table
-
applyAllCaseFold
public abstract void applyAllCaseFold(int flag, ApplyAllCaseFoldFunction fun, java.lang.Object arg)
Expand case folds given a character class (used for case insensitive matching)
- Parameters:
flag - case fold flag
fun - case folding functor (look at: ApplyCaseFold)
arg - case folding functor argument (look at: ApplyCaseFoldArg)
Oniguruma equivalent: apply_all_case_fold
-
caseFoldCodesByString
public abstract CaseFoldCodeItem[] caseFoldCodesByString(int flag, byte[] bytes, int p, int end)
Expand AST string nodes into their folded alternatives (look at: Analyser.expandCaseFoldString). Oniguruma equivalent: get_case_fold_codes_by_str
-
propertyNameToCType
public abstract int propertyNameToCType(byte[] bytes, int p, int end)
Returns character type given character type name (used when e.g. \p{Alpha}) Oniguruma equivalent:property_name_to_ctype
-
isCodeCType
public abstract boolean isCodeCType(int code, int ctype)
Perform a check whether given code is of given character type (e.g. used by isWord(someByte) and similar methods)
- Parameters:
code - a code point of a character
ctype - a character type to check against
Oniguruma equivalent: is_code_ctype
-
ctypeCodeRange
public abstract int[] ctypeCodeRange(int ctype, IntHolder sbOut)
Returns code range for a given character type Oniguruma equivalent:get_ctype_code_range
-
leftAdjustCharHead
public abstract int leftAdjustCharHead(byte[] bytes, int p, int s, int end)
Seeks the previous character head in a stream Oniguruma equivalent:left_adjust_char_head
- Parameters:
bytes - byte stream
p - position
s - stop
end - end
-
isReverseMatchAllowed
public abstract boolean isReverseMatchAllowed(byte[] bytes, int p, int end)
Returns true if it's safe to use reversal Boyer-Moore search fail fast algorithm Oniguruma equivalent:is_allowed_reverse_match
-
caseMap
public abstract int caseMap(IntHolder flagP, byte[] bytes, IntHolder pp, int end, byte[] to, int toP, int toEnd)
Oniguruma equivalent:case_map
-
rightAdjustCharHead
public final int rightAdjustCharHead(byte[] bytes, int p, int s, int end)
-
rightAdjustCharHeadWithPrev
public final int rightAdjustCharHeadWithPrev(byte[] bytes, int p, int s, int end, IntHolder prev)
-
prevCharHead
public final int prevCharHead(byte[] bytes, int p, int s, int end)
-
stepBack
public final int stepBack(byte[] bytes, int p, int s, int end, int n)
-
step
public final int step(byte[] bytes, int p, int end, int n)
-
strLength
public abstract int strLength(byte[] bytes, int p, int end)
-
strCodeAt
public abstract int strCodeAt(byte[] bytes, int p, int end, int index)
-
strLengthNull
public final int strLengthNull(byte[] bytes, int p, int end)
-
strByteLengthNull
public final int strByteLengthNull(byte[] bytes, int p, int end)
-
strNCmp
public final int strNCmp(byte[] bytes, int p, int end, byte[] ascii, int asciiP, int n)
-
isNewLine
public final boolean isNewLine(int code)
-
isGraph
public final boolean isGraph(int code)
-
isPrint
public final boolean isPrint(int code)
-
isAlnum
public final boolean isAlnum(int code)
-
isAlpha
public final boolean isAlpha(int code)
-
isLower
public final boolean isLower(int code)
-
isUpper
public final boolean isUpper(int code)
-
isCntrl
public final boolean isCntrl(int code)
-
isPunct
public final boolean isPunct(int code)
-
isSpace
public final boolean isSpace(int code)
-
isBlank
public final boolean isBlank(int code)
-
isDigit
public final boolean isDigit(int code)
-
isXDigit
public final boolean isXDigit(int code)
-
isWord
public final boolean isWord(int code)
-
isMbcWord
public final boolean isMbcWord(byte[] bytes, int p, int end)
-
isSbWord
public final boolean isSbWord(int code)
-
isMbcHead
public final boolean isMbcHead(byte[] bytes, int p, int end)
-
isMbcCrnl
public boolean isMbcCrnl(byte[] bytes, int p, int end)
-
digitVal
public static int digitVal(int code)
-
odigitVal
public static int odigitVal(int code)
-
xdigitVal
public final int xdigitVal(int code)
-
isMbcAscii
public static boolean isMbcAscii(byte b)
-
isAscii
public static boolean isAscii(int code)
-
isAscii
public static boolean isAscii(byte b)
-
asciiToLower
public static byte asciiToLower(int c)
-
asciiToUpper
public static byte asciiToUpper(int c)
-
isWordGraphPrint
public static boolean isWordGraphPrint(int ctype)
-
mbcodeStartPosition
@Deprecated public final int mbcodeStartPosition()
Deprecated.
-
isSingleByte
public final boolean isSingleByte()
-
isFixedWidth
public final boolean isFixedWidth()
-
load
public static Encoding load(java.lang.String name)
-
load
public static Encoding load(java.lang.String name, java.lang.String pkg)
-
-