unicode
This module provides support to handle the Unicode UTF-8 encoding.
There are no specialized `insert`, `delete`, `add` and `contains` procedures for `seq[Rune]` in this module because the generic variants of these procedures in the system module already work with it.
The current version is compatible with Unicode v12.0.0.
See also:
Types
Rune = distinct RuneImpl
-
Type that can hold a single Unicode code point.
A Rune may be composed with other Runes to a character on the screen.
Source Edit
RuneImpl is the underlying type used to store Runes, currently int32.
Procs
proc runeLen(s: string): int {...}{.gcsafe, extern: "nuc$1", raises: [], tags: [].}
- Returns the number of runes of the string
s
.Example:
let a = "añyóng" doAssert a.runeLen == 6 ## note: a.len == 8
Source Edit proc runeLenAt(s: string; i: Natural): int {...}{.raises: [], tags: [].}
-
Returns the number of bytes the rune starting at
s[i]
takes.See also:
Example:
let a = "añyóng" doAssert a.runeLenAt(0) == 1 doAssert a.runeLenAt(1) == 2
Source Edit proc runeAt(s: string; i: Natural): Rune {...}{.raises: [], tags: [].}
-
Returns the rune in
s
at byte indexi
.See also:
Example:
let a = "añyóng" doAssert a.runeAt(1) == "ñ".runeAt(0) doAssert a.runeAt(2) == "ñ".runeAt(1) doAssert a.runeAt(3) == "y".runeAt(0)
Source Edit proc validateUtf8(s: string): int {...}{.raises: [], tags: [].}
-
Returns the position of the invalid byte in s if the string s does not hold valid UTF-8 data. Otherwise -1 is returned.
See also:
- toUTF8 proc
-
$ proc alias for
toUTF8
- fastToUTF8Copy template
proc toUTF8(c: Rune): string {...}{.gcsafe, extern: "nuc$1", raises: [], tags: [].}
-
Converts a rune into its UTF-8 representation.
See also:
- validateUtf8 proc
-
$ proc alias for
toUTF8
- utf8 iterator
- fastToUTF8Copy template
Example:
let a = "añyóng" doAssert a.runeAt(1).toUTF8 == "ñ"
Source Edit proc add(s: var string; c: Rune) {...}{.raises: [], tags: [].}
- Adds a rune
c
to a strings
.Example:
var s = "abc" let c = "ä".runeAt(0) s.add(c) doAssert s == "abcä"
Source Edit proc `$`(rune: Rune): string {...}{.raises: [], tags: [].}
-
An alias for toUTF8.
See also:
Source Edit proc `$`(runes: seq[Rune]): string {...}{.raises: [], tags: [].}
-
Converts a sequence of Runes to a string.
See also:
- toRunes for a reverse operation
Example:
let someString = "öÑ" someRunes = toRunes(someString) doAssert $someRunes == someString
Source Edit proc runeOffset(s: string; pos: Natural; start: Natural = 0): int {...}{.raises: [], tags: [].}
-
Returns the byte position of the rune at position pos in s with an optional start byte position. Returns the special value -1 if it runs out of the string.
Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.
See also:
Example:
let a = "añyóng" doAssert a.runeOffset(1) == 1 doAssert a.runeOffset(3) == 4 doAssert a.runeOffset(4) == 6
Source Edit proc runeReverseOffset(s: string; rev: Positive): (int, int) {...}{.raises: [], tags: [].}
-
Returns a tuple with the byte offset of the rune at position rev in s, counting from the end (starting with 1), and the total number of runes in the string.
Returns a negative value for offset if there are too few runes in the string to satisfy the request.
Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.
See also:
Source Edit proc runeAtPos(s: string; pos: int): Rune {...}{.raises: [], tags: [].}
-
Returns the rune at position
pos
.Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.
See also:
Source Edit proc runeStrAtPos(s: string; pos: Natural): string {...}{.raises: [], tags: [].}
-
Returns the rune at position
pos
as UTF8 String.Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.
See also:
Source Edit proc runeSubStr(s: string; pos: int; len: int = int.high): string {...}{.raises: [], tags: [].}
-
Returns the UTF-8 substring starting at code point pos with len code points.
If pos or len is negative they count from the end of the string. If len is not given it means the longest possible string.
Example:
let s = "Hänsel ««: 10,00€" doAssert(runeSubStr(s, 0, 2) == "Hä") doAssert(runeSubStr(s, 10, 1) == ":") doAssert(runeSubStr(s, -6) == "10,00€") doAssert(runeSubStr(s, 10) == ": 10,00€") doAssert(runeSubStr(s, 12, 5) == "10,00") doAssert(runeSubStr(s, -6, 3) == "10,")
Source Edit proc `<=%`(a, b: Rune): bool {...}{.raises: [], tags: [].}
- Checks if code point of
a
is smaller or equal to code point ofb
.Example:
let a = "ú".runeAt(0) b = "ü".runeAt(0) doAssert a <=% b
Source Edit proc `<%`(a, b: Rune): bool {...}{.raises: [], tags: [].}
- Checks if code point of
a
is smaller than code point ofb
.Example:
let a = "ú".runeAt(0) b = "ü".runeAt(0) doAssert a <% b
Source Edit proc `==`(a, b: Rune): bool {...}{.raises: [], tags: [].}
- Checks if two runes are equal. Source Edit
proc toLower(c: Rune): Rune {...}{.gcsafe, extern: "nuc$1", raises: [], tags: [].}
-
Converts c into lower case. This works for any rune.
If possible, prefer toLower over toUpper.
See also:
Source Edit proc toUpper(c: Rune): Rune {...}{.gcsafe, extern: "nuc$1", raises: [], tags: [].}
-
Converts
c
into upper case. This works for any rune.If possible, prefer
toLower
overtoUpper
.See also:
Source Edit proc toTitle(c: Rune): Rune {...}{.gcsafe, extern: "nuc$1", raises: [], tags: [].}
-
Converts
c
to title case.See also:
Source Edit proc isLower(c: Rune): bool {...}{.gcsafe, extern: "nuc$1", raises: [], tags: [].}
-
Returns true if
c
is a lower case rune.If possible, prefer
isLower
overisUpper
.See also:
Source Edit proc isUpper(c: Rune): bool {...}{.gcsafe, extern: "nuc$1", raises: [], tags: [].}
-
Returns true if c is an upper case rune.
If possible, prefer isLower over isUpper.
See also:
Source Edit proc isAlpha(c: Rune): bool {...}{.gcsafe, extern: "nuc$1", raises: [], tags: [].}
-
Returns true if
c
is an alpha rune (i.e., a letter).See also:
Source Edit proc isTitle(c: Rune): bool {...}{.gcsafe, extern: "nuc$1", raises: [], tags: [].}
-
Returns true if
c
is a Unicode titlecase code point.See also:
Source Edit proc isWhiteSpace(c: Rune): bool {...}{.gcsafe, extern: "nuc$1", raises: [], tags: [].}
-
Returns true if
c
is a Unicode whitespace code point.See also:
Source Edit proc isCombining(c: Rune): bool {...}{.gcsafe, extern: "nuc$1", raises: [], tags: [].}
-
Returns true if
c
is a Unicode combining code unit.See also:
Source Edit proc isAlpha(s: string): bool {...}{.noSideEffect, gcsafe, extern: "nuc$1Str", raises: [], tags: [].}
- Returns true if
s
contains all alphabetic runes.Example:
let a = "añyóng" doAssert a.isAlpha
Source Edit proc isSpace(s: string): bool {...}{.noSideEffect, gcsafe, extern: "nuc$1Str", raises: [], tags: [].}
- Returns true if
s
contains all whitespace runes.Example:
let a = "\t\l \v\r\f" doAssert a.isSpace
Source Edit proc toUpper(s: string): string {...}{.noSideEffect, gcsafe, extern: "nuc$1Str", raises: [], tags: [].}
- Converts
s
into upper-case runes.Example:
doAssert toUpper("abγ") == "ABΓ"
Source Edit proc toLower(s: string): string {...}{.noSideEffect, gcsafe, extern: "nuc$1Str", raises: [], tags: [].}
- Converts
s
into lower-case runes.Example:
doAssert toLower("ABΓ") == "abγ"
Source Edit proc swapCase(s: string): string {...}{.noSideEffect, gcsafe, extern: "nuc$1", raises: [], tags: [].}
-
Swaps the case of runes in
s
.Returns a new string such that the cases of all runes are swapped if possible.
Example:
doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA"
Source Edit proc capitalize(s: string): string {...}{.noSideEffect, gcsafe, extern: "nuc$1", raises: [], tags: [].}
- Converts the first character of
s
into an upper-case rune.Example:
doAssert capitalize("βeta") == "Βeta"
Source Edit proc translate(s: string; replacements: proc (key: string): string): string {...}{. gcsafe, extern: "nuc$1", raises: [], tags: [].}
-
Translates words in a string using the replacements proc to substitute words inside s with their replacements.
replacements is any proc that takes a word and returns a new word to fill its place.
Example:
proc wordToNumber(s: string): string = case s of "one": "1" of "two": "2" else: s let a = "one two three four" doAssert a.translate(wordToNumber) == "1 2 three four"
Source Edit proc title(s: string): string {...}{.noSideEffect, gcsafe, extern: "nuc$1", raises: [], tags: [].}
-
Converts
s
to a unicode title.Returns a new string such that the first character in each word inside
s
is capitalized.Example:
doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma"
Source Edit proc toRunes(s: string): seq[Rune] {...}{.raises: [], tags: [].}
-
Obtains a sequence containing the Runes in
s
.See also:
- $ proc for a reverse operation
Example:
let a = toRunes("aáä") doAssert a == @["a".runeAt(0), "á".runeAt(0), "ä".runeAt(0)]
Source Edit proc cmpRunesIgnoreCase(a, b: string): int {...}{.gcsafe, extern: "nuc$1", raises: [], tags: [].}
- Compares two UTF-8 strings and ignores the case. Returns:
0 if a == b
< 0 if a < b
> 0 if a > b
Source Edit proc reversed(s: string): string {...}{.raises: [], tags: [].}
-
Returns the reverse of
s
, interpreting it as runes.Unicode combining characters are correctly interpreted as well.
Example:
assert reversed("Reverse this!") == "!siht esreveR" assert reversed("先秦兩漢") == "漢兩秦先" assert reversed("as⃝df̅") == "f̅ds⃝a" assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
Source Edit proc graphemeLen(s: string; i: Natural): Natural {...}{.raises: [], tags: [].}
- The number of bytes belonging to byte index
s[i]
, including following combining code unit.Example:
let a = "añyóng" doAssert a.graphemeLen(1) == 2 ## ñ doAssert a.graphemeLen(2) == 1 doAssert a.graphemeLen(4) == 2 ## ó
Source Edit proc lastRune(s: string; last: int): (Rune, int) {...}{.raises: [], tags: [].}
- Length of the last rune in
s[0..last]
. Returns the rune and its length in bytes. Source Edit proc size(r: Rune): int {...}{.noSideEffect, raises: [], tags: [].}
- Returns the number of bytes the rune
r
takes.Example:
let a = toRunes "aá" doAssert size(a[0]) == 1 doAssert size(a[1]) == 2
Source Edit proc splitWhitespace(s: string): seq[string] {...}{.noSideEffect, gcsafe, extern: "ncuSplitWhitespace", raises: [], tags: [].}
- The same as the splitWhitespace iterator, but is a proc that returns a sequence of substrings. Source Edit
proc split(s: string; seps: openArray[Rune] = unicodeSpaces; maxsplit: int = -1): seq[ string] {...}{.noSideEffect, gcsafe, extern: "nucSplitRunes", raises: [], tags: [].}
- The same as the split iterator, but is a proc that returns a sequence of substrings. Source Edit
proc split(s: string; sep: Rune; maxsplit: int = -1): seq[string] {...}{. noSideEffect, gcsafe, extern: "nucSplitRune", raises: [], tags: [].}
- The same as the split iterator, but is a proc that returns a sequence of substrings. Source Edit
proc strip(s: string; leading = true; trailing = true; runes: openArray[Rune] = unicodeSpaces): string {...}{.noSideEffect, gcsafe, extern: "nucStrip", raises: [], tags: [].}
-
Strips leading or trailing runes from s and returns the resulting string.
If leading is true (default), leading runes are stripped. If trailing is true (default), trailing runes are stripped. If both are false, the string is returned unchanged.
Example:
let a = "\táñyóng " doAssert a.strip == "áñyóng" doAssert a.strip(leading = false) == "\táñyóng" doAssert a.strip(trailing = false) == "áñyóng "
Source Edit proc repeat(c: Rune; count: Natural): string {...}{.noSideEffect, gcsafe, extern: "nucRepeatRune", raises: [], tags: [].}
-
Returns a string of
count
Runesc
.The returned string will have a rune-length of
count
.Example:
let a = "ñ".runeAt(0) doAssert a.repeat(5) == "ñññññ"
Source Edit proc align(s: string; count: Natural; padding = ' '.Rune): string {...}{. noSideEffect, gcsafe, extern: "nucAlignString", raises: [], tags: [].}
-
Aligns a unicode string s with padding, so that it has a rune-length of count.
padding characters (by default spaces) are added before s resulting in right alignment. If s.runelen >= count, no spaces are added and s is returned unchanged. If you need to left align a string use the alignLeft proc.
Example:
assert align("abc", 4) == " abc" assert align("a", 0) == "a" assert align("1232", 6) == " 1232" assert align("1232", 6, '#'.Rune) == "##1232" assert align("Åge", 5) == " Åge" assert align("×", 4, '_'.Rune) == "___×"
Source Edit proc alignLeft(s: string; count: Natural; padding = ' '.Rune): string {...}{. noSideEffect, raises: [], tags: [].}
-
Left-aligns a unicode string s with padding, so that it has a rune-length of count.
padding characters (by default spaces) are added after s resulting in left alignment. If s.runelen >= count, no spaces are added and s is returned unchanged. If you need to right align a string use the align proc.
Example:
assert alignLeft("abc", 4) == "abc " assert alignLeft("a", 0) == "a" assert alignLeft("1232", 6) == "1232 " assert alignLeft("1232", 6, '#'.Rune) == "1232##" assert alignLeft("Åge", 5) == "Åge " assert alignLeft("×", 4, '_'.Rune) == "×___"
Source Edit
Iterators
iterator runes(s: string): Rune {...}{.raises: [], tags: [].}
- Iterates over any rune of the string
s
returning runes. Source Edit iterator utf8(s: string): string {...}{.raises: [], tags: [].}
-
Iterates over any rune of the string
s
returning utf8 values.See also:
- validateUtf8 proc
- toUTF8 proc
-
$ proc alias for
toUTF8
- fastToUTF8Copy template
iterator split(s: string; seps: openArray[Rune] = unicodeSpaces; maxsplit: int = -1): string {...}{.raises: [], tags: [].}
-
Splits the unicode string
s
into substrings using a group of separators.Substrings are separated by a substring containing only
seps
.for word in split("this\lis an\texample"): writeLine(stdout, word)
...generates this output:
"this" "is" "an" "example"
And the following code:
for word in split("this:is;an$example", {';', ':', '$'}): writeLine(stdout, word)
...produces the same output as the first example. The code:
let date = "2012-11-20T22:08:08.398990" let separators = {' ', '-', ':', 'T'} for number in split(date, separators): writeLine(stdout, number)
...results in:
"2012" "11" "20" "22" "08" "08.398990"
Source Edit iterator splitWhitespace(s: string): string {...}{.raises: [], tags: [].}
- Splits a unicode string at whitespace runes. Source Edit
iterator split(s: string; sep: Rune; maxsplit: int = -1): string {...}{.raises: [], tags: [].}
-
Splits the unicode string
s
into substrings using a single separator.Substrings are separated by the rune
sep
. The code:for word in split(";;this;is;an;;example;;;", ';'): writeLine(stdout, word)
Results in:
"" "" "this" "is" "an" "" "example" "" "" ""
Source Edit
Templates
template fastRuneAt(s: string; i: int; result: untyped; doInc = true)
-
Returns the rune s[i] in result.
If doInc == true (default), i is incremented by the number of bytes that have been processed.
Source Edit template fastToUTF8Copy(c: Rune; s: var string; pos: int; doInc = true)
-
Copies the UTF-8 representation of c into the preallocated string s starting at position pos.
If doInc == true (default), pos is incremented by the number of bytes that have been processed.
To be the most efficient, make sure s is preallocated with an additional amount equal to the byte length of c.
See also:
- validateUtf8 proc
- toUTF8 proc
-
$ proc alias for
toUTF8
© 2006–2021 Andreas Rumpf
Licensed under the MIT License.
https://nim-lang.org/docs/unicode.html