Leo's Home page -- Github Page -- License: CC BY-SA 4.0

DRAFT - V3¶

I've been meaning to finish (for a definition of finish) and publish this for a couple of years now, but I think it's better to just publish it now than to keep waiting, as I've moved on to many other projects and also have a busy life.

This needs more sorting and a proper write-up of the results, which I should re-run and update with the latest technologies to make it complete, but the ideas and main conclusions do still hold.

Flexible Universal Character Level Encoding methods for Text in NLP (FlexCodes)¶

A Study of Multiple Encoding Techniques for Low Resource Consumption Character Level Embeddings for NLP Tasks¶

  • Leonardo M. Rocha
  • Contact Me

Note [Nov 2022]¶

Many other references exist and the field has advanced with models containing several billion parameters. The current publication and its ideas remain the same, as they concern pre-processing stages before the network input layer.

Abstract¶

Currently NLP deals with Out of Vocabulary (OOV) symbols in different ways; this leads to several, not necessarily efficient, ways of pre-processing NLP datasets to be able to deal with those input vocabularies, and to the need to rethink the issue for every new application.

In this work we present different techniques to deterministically encode character- and token-level input that deal with OOV and allow encoding all possible symbols in a computationally efficient manner, representing all (or part) of the UTF-8 domain in a fixed-size vector. This input can be used as-is, or it can be compressed and efficiently pre-trained on a large vocabulary with an autoencoder.

These techniques eliminate the need for complex and compute-intensive pre-processing to find the existing tokens in the domain, replacing it with a much simpler one that works for any dataset within the initially defined character subset (here we study UTF-8 and some subsets of it). This work focuses on being able to encode a symbol; once it is encoded, the network that takes it as input can be fine-tuned without further modification.

We are not looking to win the SoTA race to the bottom; we are looking for alternatives that expand knowledge and model usability under restricted resource usage, and that allow more task variability in a single trained model.

Note: There is a great read about SoTA and leaderboards here by Anna Rogers

Contributions¶

  1. We explore the following deterministic encoding techniques:
    1. A Segment-Multihot-Encoding (SME) technique for UTF-8, which we call SME-UTF8
    2. Choosing k of N ${N\choose k}$
    3. Multiple joint co-prime codes
  2. We present a Python library that can create the necessary character-level codes and compute token-level codes from them
  3. All the source code is made available under the MIT license in the project's repository
  4. We also present, as a curiosity added to this study, Overfitting Compression: using overfitting to compress an entire known domain into a smaller vector.

Notes¶

  • An extra advantage of this methodology is that a deterministic, well-defined process with a small representation size can be implemented efficiently in hardware.
  • This is also presented directly as an IPython notebook so it can be executed where needed. We call this the Executable Paper, and the idea is to improve reproducibility.

Introduction and Related Work¶

Currently, NLP tasks require first analyzing the input domain, extracting the tokens and training an embedding network, while also dealing with Out of Vocabulary (OOV) words or symbols and polysemy.

It is important to separate (and we do in this work) the encoding part (being able to represent the symbol) from learning to use those symbols (the network being able to do something useful with them), as this work focuses solely on being able to encode all feasible symbols in a defined text encoding domain. In this case the work is done for UTF-8, which is the most used text encoding on the web (94.6% according to w3techs).

Diverse techniques deal differently with OOV, ranging from techniques that cannot handle it, like GloVe - Pennington et al. 2014 or Word2Vec - Mikolov et al. 2013, to others such as the Universal Sentence Encoder - Cer et al. 2018 or FastText, which can encode OOV words with subword embeddings. One of the most used techniques is Byte Pair Encoding from Neural Machine Translation of Rare Words with Subword Units - Sennrich et al. 2015, which has the advantage of compressing the input sequence length compared to character-based models, hence accelerating training compared to a full character-level input on the current SoTA, while at the same time having fewer tokens than word level, which makes it a good compromise on both sides.

Some important references are:

  • ELMo - Peters et al. 2018
  • ULM-FiT - Howard, Ruder 2018
  • BERT - Devlin et al. 2018
  • AlBERT - Lan et al. 2020
  • CamemBERT - Martin et al. 2019
  • GPT2 - Radford et al. 2019 and GPT3 - Brown et al. 2020

All current methods deal with subdomains of the possible inputs available, which for most tasks is enough; nevertheless, the weakness is that they cannot deal with all possible input symbols, which in the current study means all of UTF-8. This work is inspired by a previous work by the same author: ROCHA 2018 MIDI encoding analysis.

In the case of continual learning, the need to add new symbols is a given, be it due to adding a new domain in the same language, new languages, or new symbols that were previously non-existent and get added to the UTF-8 encoding.

It is also important to note that we are looking for over-represented vector spaces with redundant information in different forms, which could be exploited differently by the network on a per-case basis.

This work analyzes the UTF-8 encoding and presents a technique to encode all or part of the UTF-8 domain in a computationally efficient way. The same technique can be used for other text encodings without any modification, and as UTF-8 is a superset of other encodings (such as ASCII), the same matrix encoding can be applied without any modification to those datasets.

Notes:

  • All the current SoTA methods are trained on clusters (and at costs) that are unavailable to most users.
  • The current work is part of a larger effort to get enough performance out of commercially available (and relatively accessible) single GPUs for end users (the NVidia RTX 2080 Ti being one of the computationally strongest cards on the market at the start of this work).

Character Level NLP¶

Character Level NLP does come with some benefits, but also with its own drawbacks like:

  • more computation needed due to longer input sequences
  • issues and errors on character level

To read more about this, the company LightTag has a nice blog post about it.


Notes¶

Note 1:

It is worth noting here that the author also has experience in communications, which allowed, during the course of this research, the analysis of multiple Error Correcting Codes (ECCs) and different kinds of encoding (for example, encoding as a Fourier series). The conclusion is that even if one-hot is the best in distance, other codes can be used, and a sparse multi-hot is the simplest to implement (and the fastest to encode, although no comparison is presented here). As a note, one pending task is to analyze ECCs in an end-to-end manner for a neural network. Some of these analyses (without proper formatting) can be found in the notebooks folders, text subfolder, of the minibrain project, where most of the experimental code is located.

Note 2:

There is an interesting pre-print, Controllable Variational Autoencoder, that uses Control Theory to improve Variational Autoencoders, and it seems to use one of the things I didn't know how to add to the codes.


Sections¶

  1. This paper first deals with an analysis of the UTF-8 encoding
  2. Then it deals with the construction of the proposed multi-hot code
  3. After that it works on compressing the multi-hot code with Overfitting (yes, Overfitting)
  4. Then it goes on to the evaluation of the codes and compares the results with other encoding methods.
  5. Conclusion
  6. Future Work
  7. Appendices

UTF-8 Analysis¶

One-Hot encoding¶

One-hot encoding is one of the most used ways to encode categorical variables; in State-of-the-Art NLP tasks it is used to encode the input symbols. This is computationally expensive, and the goal here is to reduce this complexity, leaving memory and computational space for other, more complex tasks in the network.

Number of code-points¶

As this work tries to encode all the characters representable in UTF-8, we have to check the feasible number, so:

From Wikipedia utf-8

UTF-8 is a variable-width character encoding capable of encoding all 1,112,064 valid Unicode code points:

$$ 17 \times 2^{16} = 1114112 $$

code points, minus the 2,048 technically-invalid surrogate code points.

That is, if encoding with one-hot we would need about 1.1M parameters per neuron in the input layer, which is expensive. The goal is to reduce this complexity (which we argue is unnecessary) by orders of magnitude.

UTF-8 structure and Encoding Details¶

The entire UTF-8 universe is NOT the entire $2^{32}$ domain; there are limitations, explained in the UTF-8 description:

| Number of bytes | Bits for code point | First code point | Last code point | Byte 1 | Byte 2 | Byte 3 | Byte 4 |
| 1 | 7 | U+0000 | U+007F | 0xxxxxxx | | | |
| 2 | 11 | U+0080 | U+07FF | 110xxxxx | 10xxxxxx | | |
| 3 | 16 | U+0800 | U+FFFF | 1110xxxx | 10xxxxxx | 10xxxxxx | |
| 4 | 21 | U+10000 | U+10FFFF | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx |

The UTF-8 code is thus formed by 4 segments; we will refer to this often during the current work.

The point is that the number of elements in the table should be at most $2^{21}$. We only need to create an index that can handle the 4 cases, which can be done with 4 different conversion tables.

In fact it is possible to just cut the UTF-8 value into chunks and do a one-hot per part:

  • there are only 4 segment ranges, which can be encoded as a one-hot to add redundancy; a Hamming or other ECC could also be added there
  • the largest value is for 8 bits -> 256 values
  • the others contain 6 bits -> 64 values

The prefix of each byte can be stripped and replaced by the initial segment one-hot.

So a complete code would be: $ 4 + 256 + 64 + 64 + 64 = 452 $

Instead of having a vector of dimension 1,112,064 to encode any UTF-8 value, one of dimension 452 is able to encode everything in the UTF-8 domain (a small sketch of this construction follows the example below).

This embedding can still be reduced, but it should already be sparse enough to make a good input. The goal here is to have a sparse vector that keeps each vector far enough from the others, at least by one dimension. Adding the redundancy code (the first 4 dimensions) makes the distance even bigger for vectors that should be further apart, while taking the locality of the UTF-8 encoding into account, since each character set is close to the ones used with it.

For example:

  • segment 1 corresponds to ASCII
  • segments 2 and 3 encode many (if not most) of the alphabets of living languages
  • segment 4 mostly encodes supplementary code points, private use areas and invalid codes.
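As announced above, here is a minimal sketch of this 452-dimensional segment multi-hot encoding (illustrative only; the function name is ours and this is not the sparse_encoders implementation, which also builds the conversion dictionaries):

import numpy as np

def sme_utf8(char):
    # Sketch of the segment multi-hot: 4 segment-indicator dimensions,
    # one block of 256 for the first byte and three blocks of 64 for the
    # 6 payload bits of each continuation byte (4 + 256 + 3*64 = 452).
    raw = char.encode("utf-8")            # 1 to 4 bytes
    vec = np.zeros(452, dtype=np.uint8)
    vec[len(raw) - 1] = 1                 # which of the 4 segments (redundancy)
    vec[4 + raw[0]] = 1                   # first byte, full 256 range
    for i, b in enumerate(raw[1:]):       # continuation bytes: keep the 6 payload bits
        vec[4 + 256 + 64 * i + (b & 0x3F)] = 1
    return vec

v = sme_utf8('é')                         # 2-byte character -> 3 active dimensions
print(v.sum(), np.nonzero(v)[0])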

Encoding details¶

UTF-8 Segments¶

To cut memory consumption even more, the table can be generated for 1 to 4 segments of the UTF-8 code, taking into account that the 4th segment is mostly composed of:

  • Supplementary Multilingual Plane (SMP) of historic scripts
  • Supplementary Special-purpose Plane (SSP)
  • Private Use Areas (PUA)
  • Invalid Codes

We can safely ignore this 4th segment (for the purposes of this article and most usages), which accounts for most of the code points.

In this particular analysis (due to resource consumption) CJK, Indic and some miscellaneous symbols are not (and will not be) needed, so the 3rd segment can be safely ignored too, reducing the memory consumption of the application even more.

So the result would be:

| Segment | # of code points | First index | Last index | Vector Size | # code exceptions | Size (MB) | Matrix Size (MB) | Sparse Size (MB) |
| 4 | 1107904 | 61440 | 1107904 | 452 | 790656 | 11538.59 | 3820.59 | 83.59 |
| 3 | 59328 | 2112 | 61439 | 388 | 4224 | 530.71 | 175.62 | 3.59 |
| 2 | 1984 | 128 | 2111 | 324 | 128 | 14.84 | 4.90 | 0.09 |
| 1 | 128 | 0 | 127 | 260 | 0 | 0.77 | 0.25 | 0.005 |

Where:

  • Segment: number of segments used from utf-8 to generate the code
  • # of Code Points: The total encoded code points generated
  • Vector Size: The embedding vector size
  • First / Last Index: corresponds to the segment first and last index in the embedding matrix
  • # Code Exceptions: Number of code exceptions raised during encoding with Python; notice that we use the standard library for this.
  • Size (MB): Size of the embedding matrix and conversion dictionaries (from-to code) once saved in disk in Dense mode
  • Matrix Size (MB): Size of the embedding matrix in disk in Dense Mode
  • Sparse Size (MB): Size of the embedding matrix in disk in Sparse mode

Notice that the code for 1 segment corresponds to a one-hot encoding of ASCII (plus the vector of size 4 that we keep unchanged in every case).

Encoding Size Notes and Partial Conclusions¶

  • Encoding Exceptions:

Instead of starting from the characters, this study starts from the binary encoding and then asks Python to generate the character from it. UTF-8 defines many of the possible codes in the code space as invalid, unused or reserved, which means that this generates an execution exception during the process of going from the binary number to the character (a small sketch follows these notes).

  • Memory Consumption:

Checking the matrix memory consumption: if a sparse matrix is used for the encodings, any new mobile or desktop device should be able to deal with the complete code. This holds even if we keep ALL the code elements in memory, without taking into account that many of them are invalid.

Considering the invalid codes as well: $\frac{790656}{1107904} = 0.71$, so only $29\%$ of this code is valid. The complete code in dense mode would then be around $3821 \times 0.29 = 1108$ MB, and in sparse mode around $25.8$ MB, which makes the memory requirements much smaller.
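Returning to the encoding exceptions above: a minimal sketch of how such exception counts arise for the 2-byte patterns (assuming the same strategy of building the raw byte pattern and letting Python's strict UTF-8 decoder reject it; the library's exact bookkeeping may differ slightly):

def count_2byte_codes():
    # Enumerate every 110xxxxx 10xxxxxx bit pattern and let Python's
    # UTF-8 decoder reject the invalid (overlong) sequences.
    valid, exceptions = 0, 0
    for payload in range(2 ** 11):                  # 11 payload bits
        b1 = 0b11000000 | (payload >> 6)            # 110xxxxx
        b2 = 0b10000000 | (payload & 0b00111111)    # 10xxxxxx
        try:
            bytes([b1, b2]).decode("utf-8")
            valid += 1
        except UnicodeDecodeError:
            exceptions += 1
    return valid, exceptions

print(count_2byte_codes())    # (1920, 128): the 128 overlong patterns raise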

Signaling the Start and End of a Sequence¶

To this end we can use the codes available in UTF-8 and add the mapping to the encoding and decoding dictionaries (not the matrix). The first 0x20 codes in UTF-8 signal different communication and control codes; we can use those for our NLP purposes, or we could choose the invalid codes at 0xC0 and 0xC1, or codes larger than U+10FFFF (at the end of segment 4, as per this paper's vocabulary).

In order to avoid any issues, and as we don't plan on using the current models for communication protocols but only for NLP purposes (this encoding could also be used to allow the network to deal directly with network communication protocols), we decide to use the control codes at the beginning of the first UTF-8 segment.

To this end we re-map some chosen printable characters to control codes; we choose printable characters that are not used much (except for some old terminal applications and maybe a few others) in order to allow for better visual debugging and easier text manipulation.

This is arbitrary and can be changed, but we start by selecting interesting characters to use: ▲△▴▵▶▷▸▹►▻▼▽▾▿◀◁◂◃◄◅◆◇◈◉◊○◌◍◎●◐◑◒◓█▒, and from these we choose a few.

Block codes used for delimitation, chosen to avoid any encoding collision:

  • SEPARATOR = '█' # to use as separator in CSV or TSV files
  • CHAR_MASK = '▒' # To use as a mask for a character

We set a reserved code space bigger than the control codes, to be used for special purposes: RESERVED_CODE_SPACE is set to 33, index 32 being the space character. \t, \r, \n and ' ' (space) live in this space; we insist: DO NOT OVERRIDE these codes.

Then we define the following codes:

  • NUL = ('◌', 0x00, '◁NUL▷') # NUL control code -> for Padding for example
  • SOH = ('◀', 0x01, '◁SOH▷') # SOH control code (Start of Heading) -> example: to indicate a task description or tgt lang
  • STX = ('◂', 0x02, '◁STX▷') # STX control code (Start of Text) -> start of text
  • ETX = ('▸', 0x03, '◁ETX▷') # ETX control code (End of Text) -> end of text
  • EOT = ('▶', 0x04, '◁EOT▷') # EOT control code (End of Transmission) -> end of document
  • UNK = ('◍', 0x15, '◁UNK▷') # NAK control code (Negative Acknowledge) -> Unknown value
  • SUB = ('◁SUB▷', 0x1A) # SUB control code (Substitute) -> Garbled or Invalid Characters
  • MSK = ('▒', 0x1A, '◁MSK▷') # Use this instead as mask for a single character

The <unk> (Unknown) element should not be used, as by design there should be no unrepresentable symbols in the code; it is added for completeness and in case of future use.

Note that the mapping is created in a way that stays as close as possible to the original meaning of the control symbols.
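A minimal sketch of how such a re-mapping could be applied to a text before encoding (the glyphs and code points mirror the constants above; the helper name is ours, and the inverse table, used for visual debugging, is just the reversed dictionary):

# map the chosen printable glyphs onto the corresponding control code points
CONTROL_REMAP = {
    '◌': '\x00',   # NUL -> padding
    '◀': '\x01',   # SOH -> start of heading / task description
    '◂': '\x02',   # STX -> start of text
    '▸': '\x03',   # ETX -> end of text
    '▶': '\x04',   # EOT -> end of document
    '◍': '\x15',   # NAK -> unknown value
    '▒': '\x1a',   # SUB -> mask for a single character
}
_TO_CONTROL = str.maketrans(CONTROL_REMAP)
_TO_GLYPH = str.maketrans({v: k for k, v in CONTROL_REMAP.items()})

def to_control_codes(text):
    # replace the visual glyphs by the control code points before encoding
    return text.translate(_TO_CONTROL)

print(to_control_codes('◀task:lm◂Bonjour▸▶').encode('unicode_escape'))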


Encoding and Decoding Subwords composed by character level combinations¶

Here we just name some ideas; this line of work will be explored further in work that is already ongoing and will be presented in the future, so no more detail is given here.

Encoding subwords and other character level combination types (like BPE)¶

While a character-level encoding might be faster at pre-processing time, it might not be the fastest at training or execution time (it needs more input steps).

The character-level encoding can thus be turned into any type of subword embedding with a few different techniques (see the sketch after this list):

  • Sum of the encoding vectors: advantage: easiest; disadvantage: does not preserve the order of the characters
  • Convolution: advantage: convolution is order dependent; disadvantages: the convolution of 2 vectors of dimension d has dimension 2d-1, and it is more costly at pre-processing time
  • Circular Convolution: advantage: order dependent while keeping the dimension fixed; disadvantage: more costly at pre-processing time
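As referenced above, a minimal sketch of the three combination options on top of any character-level code matrix (random binary vectors stand in for the real codes here; the circular convolution is done through the FFT):

import numpy as np

rng = np.random.default_rng(0)
d = 96
char_codes = {c: (rng.random(d) < 0.1).astype(float) for c in "abcdefgh"}

def by_sum(token):
    # order-insensitive bag of characters
    return np.sum([char_codes[c] for c in token], axis=0)

def by_convolution(token):
    # binds characters by linear convolution; the dimension grows with each character
    vec = char_codes[token[0]]
    for c in token[1:]:
        vec = np.convolve(vec, char_codes[c])
    return vec

def by_circular_convolution(token):
    # binds by circular convolution (via FFT); keeps the dimension fixed at d
    vec = char_codes[token[0]]
    for c in token[1:]:
        vec = np.real(np.fft.ifft(np.fft.fft(vec) * np.fft.fft(char_codes[c])))
    return vec

print(by_sum("abc").shape, by_convolution("abc").shape, by_circular_convolution("abc").shape)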

Decoding subwords and other character level combination types (like BPE)¶

  • In the case of decoding these subword-level encodings, the decoding can again be done by similarity search, only that the search space of the decoder will be bigger than at character level.
  • It can be done with one-hot decoding, which is still worthwhile because of the savings during encoding; the decoding domain can be a restricted one, for example only one output language or a classification task.
  • It can also be done at character level by deconvolving a known last element with the previous vector and iterating from there, but this requires learning and needs to be studied.

Codes Creation¶

This section is dedicated to code execution and measurements to fill the table above

In [1]:
from sparse_encoders import create_measure_tables
create_measure_tables()
number of codes =  128
number of code_exceptions =  0
number of codes =  1984
number of code_exceptions =  128
number of codes =  59328
number of code_exceptions =  4224
number of codes =  1107904
number of code_exceptions =  790656
| Segments | exec_time (sec) |  matrix_shape | Size in Disk (MB): | Matrix Size in Disk (MB):            | Sparse Matrix Size in Disk (MB): |code path
| 1 | 0.004 | (128, 260) | 0.77 | 0.25 | 0.00 | codes/utf8_codes-1seg.pkl |
| 2 | 0.094 | (1984, 324) | 14.83 | 4.90 | 0.09 | codes/utf8_codes-2seg.pkl |
| 3 | 2.051 | (59328, 388) | 530.26 | 175.62 | 3.59 | codes/utf8_codes-3seg.pkl |
| 4 | 45.606 | (1107904, 452) | 11530.14 | 3820.59 | 83.59 | codes/utf8_codes-4seg.pkl |

Observations¶

Execution of previous results¶

The execution of all the previous code is done in a single thread of an Intel i7 7700.

Embedding Sizes¶

The size of the embedding matrix grows with the number of code points and the embedding size; observing the size of the matrices, their sparse representation is negligible in comparison with current models.

NVidia provides support for sparse operations with cuSPARSE (documentation) which means we can use these matrices.

Nevertheless, many applications work in dense mode, and in that case working with fewer than the 4 segments is advisable. The 3-segment embedding provides enough representation for all languages on Earth, and the 2-segment one already supports most languages, as stated in a previous section.

Execution Time¶

As seen in the code execution above, even if the vectors are big to keep saved and to download, their creation is deterministic and they can be recreated in less than a minute of execution on a single thread of an off-the-shelf CPU.

PC Configuration:¶

Hardware¶

intel i7 7700
64GB of RAM
GPU-0 GTX1080 -> runs the GUI and other tasks, sometimes used for train and testing
GPU-1 RTX2080ti -> only used for computation

Software¶

Cuda V10.1
Pytorch V1.3
Python 3.7

Overfitting Compression¶

In the literature overfitting is an evil creature, but in this case, as we know the entire domain, we are going to use it to our advantage by overfitting the sparse input (the multi-hot encoded vector) into an embedding vector smaller than the input; the goal here is lossless compression.

This is done to be able to make a more informed decision at the end of the study and to show comparative results. We are able to show that overfitting underperforms manual encoding (which comes as no surprise here): where overfitting fails to decode vectors, the manually created codes are smaller and repeatable.

Once the network is trained (basically an overfitted autoencoder), a new encoding matrix is generated by passing each element of the input domain through the autoencoder and taking the latent vector; this is used to build a matrix that can be given directly as an Embedding to the network.

As the UTF-8 coding uses a maximum of 4 bytes for the code representation, vectors of size at least 32 are needed. The smallest code for this would be to use the UTF-8 code directly as the embedding (which we should also test as input to be able to compare the results).

Any vector representation that handles the complete domain must then be at least of dimension 32. Here we'll test several dimensions for each number of codes; the representation for 1-segment coding is done just for completion, and the one for 2-segment coding might be useful, but with only 1984 elements a one-hot encoding does not pose a big problem with current resources. The code starts to be more interesting for 3-segment and 4-segment coding.

The training is done with the following configuration:

Batch Size: size of the whole symbol set; each batch contains every symbol in the domain once
Network Configuration: Autoencoder
Loss: CrossEntropy

And we measure:

Output Vector Embedding Size
Execution Time
Matrix Size on Disk (here only the dense matrix is taken into account as there should be close to no sparsity)
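A minimal PyTorch sketch of this overfitted autoencoder setup for the 2-segment code (the architecture, sizes and the BCE-with-logits loss used here for the multi-hot targets are assumptions for illustration, not the exact training script):

import torch
import torch.nn as nn

class CodeAutoencoder(nn.Module):
    # Overfit the full multi-hot code matrix into a smaller latent vector
    def __init__(self, in_dim=324, latent_dim=64):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(in_dim, 128), nn.ReLU(),
                                     nn.Linear(128, latent_dim))
        self.decoder = nn.Sequential(nn.Linear(latent_dim, 128), nn.ReLU(),
                                     nn.Linear(128, in_dim))

    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z), z

# the whole domain in every batch: a random stand-in for the real (1984, 324) code matrix
codes = torch.rand(1984, 324).round()
model = CodeAutoencoder()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()              # one sigmoid per dimension of the multi-hot target

for step in range(200):                       # overfitting is the goal, not generalization
    reconstruction, _ = model(codes)
    loss = loss_fn(reconstruction, codes)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

with torch.no_grad():                         # the latent vectors become the compressed embedding matrix
    embedding_matrix = model.encoder(codes)   # shape: (1984, 64)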


The experiments on overfitting were run with different vector sizes, from 32 to 128, for encodings using 3 and 4 segments.

TODO: do the runs again (with a better and cleaner script) and put the results here.

Although I should present the results, these were not conclusive at the time of the experiments. The codes were learnt by overfitting and could be used, but at the moment there seemed to be no extra benefit in doing it this way, as it needs learning while the other codes presented here are deterministic.

Sparse Codes¶

In this section we develop a methodology and different codes with different vector sizes and distances. All the generated codes are saved and later used to test in NLP tasks.

Two different ideas are used to create the sparse codes: the first one is choosing k from N; the second is to generate a multi-hot from the combination of different smaller codes of co-prime sizes, which leads to longer cycles in the combination of the codes.

Although any code order might be enough, we look for a repeatable, predictable and deterministic process that reaches the same values each time we recompute the code.

Sparse Codes, Choosing k of N ${N\choose k}$¶

For completion, this study also deals with different sparse coding techniques; the basic idea is choosing $k$ active positions out of $N$.

We need to basically do the following: ${N\choose k}$

Where $32 \le N \le M$, defining $M$ as the maximum value of the desired vector dimension,

and $k$ should be minimized to increase the sparsity of the vector as much as possible.

We can also again add some redundancy as in the previous multi-hot code, i.e. the first 4 elements should indicate which UTF-8 segment is being used for the code point.

$${N\choose k} = \frac{N!}{k!(N-k)!}$$
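A minimal sketch of such a deterministic ${N\choose k}$ code (illustrative only; the library's create_sparse_Nk_codes does the real bookkeeping): symbol $j$ simply gets the $j$-th $k$-combination of positions in lexicographic order, so re-running the generation always yields the same matrix.

from itertools import combinations, islice
import numpy as np

def choose_nk_codes(n_symbols, N, k):
    # deterministic N-choose-k multi-hot: symbol j gets the j-th k-combination
    codes = np.zeros((n_symbols, N), dtype=bool)
    for j, positions in enumerate(islice(combinations(range(N), k), n_symbols)):
        codes[j, list(positions)] = True
    return codes

codes = choose_nk_codes(128, N=17, k=2)            # C(17, 2) = 136 >= 128, as in the 1-segment row below
print(codes.shape, int(codes.sum(axis=1).max()))   # (128, 17) 2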

TODO: there might be an issue (to explore in the validation part) with this method: we cannot use a multi-softmax decoding method; instead, classic multi-label classification methods need to be used (a Sigmoid layer), followed by vector similarity. In the case where both the ${N\choose k}$ and co-prime methods are mixed, the decoding part of the network architecture gets a bit more complex.

The vector sizes are explored here:

In [2]:
from sparse_encoders import sparse_Nk_dimension_analysis
In [3]:
%%time
results = sparse_Nk_dimension_analysis()
len(results)
CPU times: user 12.9 ms, sys: 358 µs, total: 13.2 ms
Wall time: 13.1 ms
Out[3]:
18
In [4]:
# results: (code points needed, possible code points, vector size, number of  ones in code, sparsity ratio)
results
Out[4]:
[(128, 136, 17, 2, '0.118'),
 (128, 165, 11, 3, '0.273'),
 (128, 210, 10, 4, '0.400'),
 (128, 252, 10, 5, '0.500'),
 (128, 210, 10, 6, '0.600'),
 (1984, 2016, 64, 2, '0.031'),
 (1984, 2024, 24, 3, '0.125'),
 (1984, 2380, 17, 4, '0.235'),
 (1984, 2002, 14, 5, '0.357'),
 (1984, 3003, 14, 6, '0.429'),
 (59328, 59640, 72, 3, '0.042'),
 (59328, 66045, 37, 4, '0.108'),
 (59328, 65780, 26, 5, '0.192'),
 (59328, 74613, 22, 6, '0.273'),
 (1107904, 1125180, 190, 3, '0.016'),
 (1107904, 1150626, 74, 4, '0.054'),
 (1107904, 1221759, 45, 5, '0.111'),
 (1107904, 1344904, 34, 6, '0.176')]

We can observe that the sizes of the vectors in these representations are much smaller than in the manual one-hot-by-segment code created before. We can again use the same technique as in the first part, adding 4 initial vector elements that represent the segment to which the symbol belongs.

There is another point to take into account: the size of the vector matters, as hardware is better adapted to certain sizes; it also has consequences for the kinds of techniques that can be applied, for example grouped convolutions.

It is convenient then to have vector sizes that are powers of two and also multiples of 96 (tensor operation sizes in NVidia Tensor Cores are of size 96... something to understand better here: https://devblogs.nvidia.com/nvidia-turing-architecture-in-depth/)

Sparse Codes, multiple joint co-prime codes¶

The idea here is to create multiple one-hot codes, each of a prime size (which is the simplest way to choose co-primes), and then join the codes into one bigger code.

To maximize the cycle we choose the dimensions of the one-hot codes to be co-prime to each other; the simplest solution is to choose prime numbers as the sizes of the sub-codes.

Again we look for combinations that minimize the vector size.
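A minimal sketch of this joint co-prime construction (illustrative, not the sparse_encoders implementation): symbol $j$ activates position $j \bmod p$ inside the one-hot block of each prime $p$, so the joint pattern only repeats after $\prod_i p_i$ symbols.

import numpy as np

def coprime_multihot(n_symbols, primes=(3, 5, 11, 13)):
    # one one-hot block per prime; symbol j activates position (j mod p) in each block
    dim = sum(primes)
    codes = np.zeros((n_symbols, dim), dtype=bool)
    for j in range(n_symbols):
        offset = 0
        for p in primes:
            codes[j, offset + (j % p)] = True
            offset += p
    return codes

codes = coprime_multihot(1984)                 # 2-segment case: 1984 < 3*5*11*13 = 2145 distinct patterns
print(codes.shape, int(codes.sum(axis=1)[0]))  # (1984, 32), 4 active dimensions per symbol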

In [5]:
%%time
from sparse_encoders import multihot_primes_conf_finder
_, codes_1seg, codes_2seg, codes_3seg, codes_4seg= multihot_primes_conf_finder()
CPU times: user 332 ms, sys: 8.77 ms, total: 341 ms
Wall time: 340 ms

The shortest codes that can handle the needed codebook sizes are:

In [6]:
codes_1seg[0], codes_2seg[0], codes_3seg[0], codes_4seg[0]
Out[6]:
(((2, 3, 5, 7), 4, 0.235, 17, 210),
 ((3, 5, 11, 13), 4, 0.125, 32, 2145),
 ((11, 13, 19, 23), 4, 0.061, 66, 62491),
 ((23, 31, 37, 43), 4, 0.03, 134, 1134383))

To build the codes, for each code we pad with redundancy so that we get an embedding vector with a size in $[32, 48, 64, 96, 128, 192, 256, 384]$.

But we can also look for a defined sparsity in the final vector embeddings; as an extra property, all the vectors have the same parity, which is nice to have to be able to check during decoding.

To take those two elements into account we'll select from the smallest vector sizes; we can see some of these here:

In [7]:
codes_1seg[:5], codes_2seg[:5], codes_3seg[:5], codes_4seg[:5]
Out[7]:
([((2, 3, 5, 7), 4, 0.235, 17, 210),
  ((3, 5, 11), 3, 0.158, 19, 165),
  ((2, 5, 13), 3, 0.15, 20, 130),
  ((2, 7, 11), 3, 0.15, 20, 154),
  ((3, 5, 13), 3, 0.143, 21, 195)],
 [((3, 5, 11, 13), 4, 0.125, 32, 2145),
  ((2, 7, 11, 13), 4, 0.121, 33, 2002),
  ((3, 5, 7, 19), 4, 0.118, 34, 1995),
  ((3, 7, 11, 13), 4, 0.118, 34, 3003),
  ((3, 5, 11, 17), 4, 0.111, 36, 2805)],
 [((11, 13, 19, 23), 4, 0.061, 66, 62491),
  ((11, 13, 17, 29), 4, 0.057, 70, 70499),
  ((11, 17, 19, 23), 4, 0.057, 70, 81719),
  ((7, 13, 23, 29), 4, 0.056, 72, 60697),
  ((7, 17, 19, 29), 4, 0.056, 72, 65569)],
 [((23, 31, 37, 43), 4, 0.03, 134, 1134383),
  ((23, 29, 37, 47), 4, 0.029, 136, 1159913),
  ((23, 29, 41, 43), 4, 0.029, 136, 1175921),
  ((17, 37, 41, 43), 4, 0.029, 138, 1108927),
  ((19, 29, 43, 47), 4, 0.029, 138, 1113571)])

Also, to fill the remaining space we can choose to combine co-prime codes with N choose k. This has the advantage of generating different patterns with redundant information that can be exploited by the network later.

Code Generation¶

First we generate the codes for the sparse and co-prime methods without padding to the HW-optimized size, then we do the padding. All codes are saved to be able to experiment with them later and compare results.

In [8]:
from sparse_encoders import create_sparse_Nk_codes, all_multihot_primes
In [9]:
%%time 
codes = create_sparse_Nk_codes()
| Segments | code size | Vector Size | N | k |exec_time (sec) |  Matrix Size in Disk (MB):                | Sparse Matrix Size in Disk (MB): |code path
| 1 | 128 | (128, 17) | 17 | 2 | 0.001 | 0.00 | 0.00 | codes/utf8_sparse_codes-1_N-17_k-2_seg |
| 2 | 1984 | (1984, 24) | 24 | 3 | 0.003 | 0.05 | 0.05 | codes/utf8_sparse_codes-2_N-24_k-3_seg |
| 3 | 59328 | (59328, 37) | 37 | 4 | 0.059 | 2.09 | 2.04 | codes/utf8_sparse_codes-3_N-37_k-4_seg |
| 4 | 1107904 | (1107904, 45) | 45 | 5 | 1.024 | 47.55 | 47.55 | codes/utf8_sparse_codes-4_N-45_k-5_seg |
CPU times: user 1.01 s, sys: 75.7 ms, total: 1.09 s
Wall time: 1.09 s
In [10]:
%%time 
cp_codes = all_multihot_primes()
| Segments | code size | Vector Size | primes | exec_time (sec) | Matrix Size in Disk (MB): | Sparse Matrix Size in Disk (MB): | code path |
| 1 | 128 | (128, 19) | (3, 5, 11) | 0.001 | 0.00 | 0.00 | codes/utf8_coprime_codes-128_primes-(3, 5, 11)_1_seg |
| 2 | 1984 | (1984, 32) | (3, 5, 11, 13) | 0.001 | 0.06 | 0.07 | codes/utf8_coprime_codes-1984_primes-(3, 5, 11, 13)_2_seg |
| 3 | 59328 | (59328, 66) | (11, 13, 19, 23) | 0.034 | 3.73 | 2.04 | codes/utf8_coprime_codes-59328_primes-(11, 13, 19, 23)_3_seg |
| 4 | 1107904 | (1107904, 134) | (23, 31, 37, 43) | 0.637 | 141.58 | 38.04 | codes/utf8_coprime_codes-1107904_primes-(23, 31, 37, 43)_4_seg |
CPU times: user 549 ms, sys: 127 ms, total: 675 ms
Wall time: 673 ms

As we can observe, these codes are light on memory (at least up to 3 segments, which should be enough for most NLP tasks in most languages); in the case of the sparse matrix representation, all of them are low on memory consumption.

Combining sparse codes¶

To take advantage of hardware vector representations it might be more advantageous to combine different codes, as they cycle differently. Two representations will be built: one starting from the complete ${N\choose k}$ code and completing with the co-prime coding, and the other starting from the co-prime code and completing with ${N\choose k}$.

Starting from the ${N\choose k}$ codes we construct smaller codes than when starting from the co-prime technique; this has the consequence of having a completely duplicated code, which gives more redundancy than the smaller codes.

The configuration decision is made somewhat arbitrarily, just trying to get a good choice while keeping the vector dimension fixed to the sizes named above. The configuration is the following:

In the following table NCODES[X] represents a list of codes created for the particular test-case.

# N choose k + coprime multihot
# code dim, N,k,target dim, prime dim, primes
Nk_coprimes = [(NCODES[0], 17, 2, 32, 15, (3, 5, 7)),
               (NCODES[1], 24, 3, 48, 24, (5, 8, 11)),
               (NCODES[2], 37, 4, 64, 27, (3, 5, 8, 11)),
               (NCODES[3], 45, 5, 96, 51, (3, 7, 11, 13, 17))]

# coprime multihot + N choose k
# code dim, primes, (N, k)
code_config = [(NCODES[0], (3, 5, 11), (13, 3)),
               (NCODES[1], (3, 5, 11, 13), (32, 3)),
               (NCODES[2], (11, 13, 19, 23), (30, 4)),
               (NCODES[3], (23, 31, 37, 43), (58, 5))]

Redundant Codes Methods¶

To add redundancy to the codes we can try several methods: basically mixing methods, or adding a new one.

While linear transformations can achieve multiple reorderings of the input vectors, having different and redundant representations should make it easier for the learning process to find the parts that are best to interpret, or pay attention to, for the task at hand.

Single Cycle multi-one-hot Segmentation Method¶

Basically this method assigns a dimension of the vector representation to each part of the code in the segment. It goes as follows:

Let $C$ be a code of vector dimension $d$, such that $dim(C)=d$, where each $c \in C$ is formed only of binary elements $\{0,1\}$.

Let $s$ be a defined subset of the dimensions of $C$, with $dim(s)=d_s < dim(C)=d$ and dimensions $s_i$ for $i=1,\dots,d_s$.

Let $N$ be a set composed of (different) elements of dimension $dim(C)$ in code $C$ (note: $N$ does not have to be complete in the sense of representing all the elements possible in $C$), where $dim(N)=n$.

$N[s_i]_j = \operatorname{int}\left(\dfrac{j}{n/d_s}\right)$, where $j$ is the index of the vector in $N$, $s_i$ is the $i^{th}$ dimension of the set of dimensions $s$ of $N_j$, and $\operatorname{int}(x)$ is the integer part of the value $x$.

Extra Notes on Mixed and redundant Codebooks¶

Normally a network will create its own redundancy depending on the needs of the application; the goal of this study here is to create a code that can also be used by applications that continually learn different tasks, on different domains and in multiple languages.

While the codebooks generated by this method will be mostly ignored if the application is single-language only (English for example), later adding parallel columns (another study in progress) will make use of different parts of the code; the embedded redundancies might (to try/prove/future work) allow for easier learning.

In [11]:
from sparse_encoders import create_choose_Nk_coprimes_codes, create_coprimes_choose_Nk_codes
In [12]:
%%time
nkcp = create_choose_Nk_coprimes_codes()
| Segments | code size | Vector Size | N | k | primes |exec_time (sec) |  Matrix Size in Disk (MB):                        | Sparse Matrix Size in Disk (MB): |code path
| 1 | 128 | (128, 32) | 17 | 2 | (3, 5, 7) | 0.001 | 0.00 | 0.01 | codes/utf8_N-17k-2-coprime_codes-128_primes-(3, 5, 7)_1_seg |
| 2 | 1984 | (1984, 48) | 24 | 3 | (5, 8, 11) | 0.002 | 0.09 | 0.10 | codes/utf8_N-24k-3-coprime_codes-1984_primes-(5, 8, 11)_2_seg |
| 3 | 59328 | (59328, 64) | 37 | 4 | (3, 5, 8, 11) | 0.061 | 3.62 | 4.07 | codes/utf8_N-37k-4-coprime_codes-59328_primes-(3, 5, 8, 11)_3_seg |
| 4 | 1107904 | (1107904, 96) | 45 | 5 | (3, 7, 11, 13, 17) | 1.381 | 101.43 | 95.09 | codes/utf8_N-45k-5-coprime_codes-1107904_primes-(3, 7, 11, 13, 17)_4_seg |
CPU times: user 1.31 s, sys: 140 ms, total: 1.45 s
Wall time: 1.44 s
In [13]:
%%time 
cpnk = create_coprimes_choose_Nk_codes()
| Segments | code size | Vector Size | N | k | primes |exec_time (sec) |  Matrix Size in Disk (MB):                        | Sparse Matrix Size in Disk (MB): |code path
| 1 | 128 | (128, 32) | 13 | 3 | (3, 5, 11) | 0.002 | 0.00 | 0.01 | codes/utf8_coprime_codes-128_primes-(3, 5, 11)_N-13k-3_1-seg |
| 2 | 1984 | (1984, 64) | 32 | 3 | (3, 5, 11, 13) | 0.007 | 0.12 | 0.12 | codes/utf8_coprime_codes-1984_primes-(3, 5, 11, 13)_N-32k-3_2-seg |
| 3 | 59328 | (59328, 96) | 30 | 4 | (11, 13, 19, 23) | 0.057 | 5.43 | 2.98 | codes/utf8_coprime_codes-59328_primes-(11, 13, 19, 23)_N-30k-4_3-seg |
| 4 | 1107904 | (1107904, 192) | 58 | 5 | (23, 31, 37, 43) | 3.375 | 202.86 | 85.58 | codes/utf8_coprime_codes-1107904_primes-(23, 31, 37, 43)_N-58k-5_4-seg |
CPU times: user 3.09 s, sys: 328 ms, total: 3.41 s
Wall time: 3.44 s

As the code that seems the most useful for most languages is the one derived from 2 and 3 segments of UTF-8, we dedicate special attention to it and generate an extra code here, in two versions with two- and three-fold redundancy. The two-fold redundant one will have a non-compliant vector size (not in the vector size list specified in the section above); the other will be $2^n$.

2 Segments Code¶

For completion, and to have extra redundancy, we also create a code that contains both the complete co-prime and ${N \choose k}$ codes for the 2-segment case (the other cases are already complete), that is, a vector of size 64 where:

| coprimes | N | k | Size | Target Size | remaining |
| (3, 5, 11, 13) | 24 | 3 | 56 | 64 | 8 |

We need to use the remaining elements to create a good enough repetition that is not in the same period/cycle as the other existing parts of the code (this is the reason the co-prime code is created instead of using another method). The idea is to maximize the information obtained by looking at only part of the code.

Some ideas on how to use the remaining 8 dimensions are:

  1. One piece of information we can add is the segment to which the code belongs, using 2 dimensions and keeping 6 for another coding.
  2. Using a co-prime coding of 3 and 5 -> issue: already used (maybe creating and sorting it?)
  3. Using an ${8 \choose 2}$ coding, but this is a sub-period of ${24 \choose 3}$; the same applies to ${6 \choose 2}$
  4. We can do ${5 \choose 2}$ and ${3 \choose 1}$; this might just work (selected)

TODO: for completion we can also create a vector of size 64 with different co-prime techniques, using some of the codes sorted as in point 2 above. This has the advantage of being able to use Softmax in the decoding layer.

3 Segments Code¶

For completion, and to have extra redundancy, we also create a code that contains both the complete co-prime and ${N \choose k}$ codes for the 3-segment case (the other cases are already complete), that is, a vector of size 128 where:

| coprimes | N | k | Size | Target Size | remaining |
| (11, 13, 19, 23) | 37 | 4 | 103 | 128 | 25 |

The remaining 25 elements needed to complete dimension 128 are then treated again as an extra redundancy element. We can add several types, but the idea is to keep not only redundancy but also sparsity; doing a ${25 \choose x}$ for $x \in [2,3,4,5,6]$ works correctly at sparsities $[0.078, 0.086, 0.093, 0.10, 0.11]$ respectively. If $x=6$ the code completes again, giving a triplicate with different patterns that should allow the network to use the information differently.

As in the case of the 2-segment code we can treat the remainder to fill our needs; there are mostly two ideas to work on:

  • Do a complete coding with ${25 \choose x}$ (selected): the simplest option; we choose $x=4$ somewhat arbitrarily as it keeps sparsity below $10\%$ and gives a big enough cycle of ${25 \choose 4} = 12650$
  • Do a partial coding, with 3 dimensions encoding the segment the symbol belongs to, plus a ${22 \choose x}$: this method also makes a complete code if $x=5$ and keeps the same sparsity as the previous option, but has specific redundant information about the UTF-8 code block.

TODO: for completion we can also create vectors of size 96 and 128 with different co-prime techniques, using some of the codes sorted as in point 2 above. This has the advantage of being able to use Softmax in the decoding layer.

Preliminary Testing Decisions¶

Due to time and resource restrictions, the first set of experiments will only deal with the following setting; the decision (after some thought) is the following:

Code:¶

  • The code will be a multi-hot sparse coding (already decided beforehand)
  • The code will be a co-prime + $N \choose k$ method due to vector size

  • The code will be based on the co-prime method (easier to apply multiple Softmax layers)
  • The redundancy will also be based on the co-prime method, but segmented by the total number of elements in the code divided by the cycle of the prime (the Single Cycle Segmentation Method above).

  • The code will be built to comprise 2 and 3 segments of the UTF-8 code only, the 3-segment one being the most significant for a complete multilingual setup and the 2-segment one much smaller, and thus faster to encode and decode for simpler testing purposes.

Character Level Encoder:¶

  • The encoder will first pass the input through a few (maybe 2 or 3) linear layers on the vector (spatial) dimension before the temporal part of the NN

Character Level Decoder¶

  • The decoder will be composed first of a transformer layer with the same number of heads as the number of ones in each vector of the codebook

  • The decoder will finish (optionally and by configuration) with a multi-softmax layer (multiple softmaxes, one for each part of the code that contains a one-hot)
  • The decoder will work directly on the final vector dimension
  • The decoder will work on the spatial dimension only
  • The final decoding will be done with vector similarity (the faiss library)

Rethinking What to Encode¶

While we now have a methodology to encode UTF-8 by segments, and any list of elements, we still need to decide what to encode.

Natural language, as well as computer languages, contains a certain amount of symbols, including emojis; these, for example, are defined in the 4th segment (in this paper's vocabulary) of UTF-8, which means that a block coding per segment will not suffice. There might also be a few other symbols that appear on other occasions and that might be useful to encode (for example for some programming languages or symbolic representations such as mathematics).

So what we'll do is create a new list of characters and symbols (Unicode code points) to encode. For this we download some of the Wikipedia pages that contain the Unicode block definitions, and then we'll clean and sort them.

Even though phonetic symbols are present in every dictionary and we plan to use them in the future, they consist of 341 code points, which we want to avoid.

For the purpose of this work we'll avoid using most CJK and Indic scripts, as they have too many symbols for the current work, while keeping as much language variety as we can with a limited number of characters. Concerning the African scripts, there are two reasons not to include them here either: the first is the number of training samples, which is not sufficient for the current work; the second is the number of symbols that would need to be added to the encoding.

Ancient and historic scripts are skipped for the same reasons as the CJK and African scripts.

Nevertheless, the way the codes are generated here makes them extendable with new symbols without changing the previously defined elements.

The codes we are after are then: languages mostly based on the Greek, Roman and Cyrillic alphabets (most European languages).

If the number of code points and the number of training samples allow it, we'll also add Semitic languages such as Hebrew (not too many code points) and Arabic.

We add emoticons, as they are used in current social networks, and symbols, as they might be of use and don't have too many code points.

Concerning the Box Drawing, Block Elements and Geometric Shapes blocks: they'll be excluded, as they will be used as special characters for file separation (as in CSV or TSV, since we can't use tabs or commas for the current purpose) and as U+2581 (▁) is used as a special character in some tokenization algorithms. Although this might pose a problem when trying to play terminal-based games with the current methodology.

In [32]:
import ntpath
import os

unicode_wiki_sites = [
    "https://en.wikipedia.org/wiki/List_of_Unicode_characters",
    "https://en.wikipedia.org/wiki/Armenian_(Unicode_block)",
    "https://en.wikipedia.org/wiki/Glagolitic_(Unicode_block)",
    "https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block)",
    "https://en.wikipedia.org/wiki/Cyrillic_Supplement",
    "https://en.wikipedia.org/wiki/Cyrillic_Extended-A",
    "https://en.wikipedia.org/wiki/Cyrillic_Extended-B",
    "https://en.wikipedia.org/wiki/Cyrillic_Extended-C",
    "https://en.wikipedia.org/wiki/Greek_and_Coptic",
    "https://en.wikipedia.org/wiki/Phonetic_symbols_in_Unicode",
    "https://en.wikipedia.org/wiki/Coptic_Epact_Numbers",
    "https://en.wikipedia.org/wiki/Coptic_(Unicode_block)",
    "https://en.wikipedia.org/wiki/Emoticons_(Unicode_block)",
    "https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet",
    "https://en.wikipedia.org/wiki/Thaana_(Unicode_block)",
    "https://en.wikipedia.org/wiki/Arabic_script_in_Unicode",
]
cmd = "lynx -dump -nolist '{}' > ./text/wiki-unicode/{}.txt"

for site in unicode_wiki_sites:
    head, tail = ntpath.split(site)
    tail = tail.replace('(','').replace(')','')
    print(tail)
    command = cmd.format(site, tail)
    print(command)
    os.system(command)
List_of_Unicode_characters
lynx -dump -nolist 'https://en.wikipedia.org/wiki/List_of_Unicode_characters' > ./text/wiki-unicode/List_of_Unicode_characters.txt
Armenian_Unicode_block
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Armenian_(Unicode_block)' > ./text/wiki-unicode/Armenian_Unicode_block.txt
Glagolitic_Unicode_block
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Glagolitic_(Unicode_block)' > ./text/wiki-unicode/Glagolitic_Unicode_block.txt
Cyrillic_Unicode_block
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block)' > ./text/wiki-unicode/Cyrillic_Unicode_block.txt
Cyrillic_Supplement
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Cyrillic_Supplement' > ./text/wiki-unicode/Cyrillic_Supplement.txt
Cyrillic_Extended-A
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Cyrillic_Extended-A' > ./text/wiki-unicode/Cyrillic_Extended-A.txt
Cyrillic_Extended-B
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Cyrillic_Extended-B' > ./text/wiki-unicode/Cyrillic_Extended-B.txt
Cyrillic_Extended-C
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Cyrillic_Extended-C' > ./text/wiki-unicode/Cyrillic_Extended-C.txt
Greek_and_Coptic
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Greek_and_Coptic' > ./text/wiki-unicode/Greek_and_Coptic.txt
Phonetic_symbols_in_Unicode
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Phonetic_symbols_in_Unicode' > ./text/wiki-unicode/Phonetic_symbols_in_Unicode.txt
Coptic_Epact_Numbers
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Coptic_Epact_Numbers' > ./text/wiki-unicode/Coptic_Epact_Numbers.txt
Coptic_Unicode_block
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Coptic_(Unicode_block)' > ./text/wiki-unicode/Coptic_Unicode_block.txt
Emoticons_Unicode_block
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Emoticons_(Unicode_block)' > ./text/wiki-unicode/Emoticons_Unicode_block.txt
Unicode_and_HTML_for_the_Hebrew_alphabet
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet' > ./text/wiki-unicode/Unicode_and_HTML_for_the_Hebrew_alphabet.txt
Thaana_Unicode_block
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Thaana_(Unicode_block)' > ./text/wiki-unicode/Thaana_Unicode_block.txt
Arabic_script_in_Unicode
lynx -dump -nolist 'https://en.wikipedia.org/wiki/Arabic_script_in_Unicode' > ./text/wiki-unicode/Arabic_script_in_Unicode.txt
In [33]:
from preprocessors.symbol_extraction import *
In [34]:
selected_path = 'selected_sources_small'
In [35]:
%%time
extract_all_chars(selected_path)
CPU times: user 27.4 ms, sys: 0 ns, total: 27.4 ms
Wall time: 24.2 ms
In [36]:
sclist = char_stats(selected_path)
2899 new chars on file selected_sources_small/List_of_Unicode_characters.txt 
48 new chars on file selected_sources_small/Cyrillic_Supplement.txt 
3 new chars on file selected_sources_small/Greek_and_Coptic.txt 
90 new chars on file selected_sources_small/Unicode_and_HTML_for_the_Hebrew_alphabet.txt 
0 new chars on file selected_sources_small/Cyrillic_Unicode_block.txt 
Total number of symbols: 3040
Number of Selected Symbols: 1569

Excluding math and drawing symbols, there is a significant reduction in the number of symbols to represent.

We'll also add the first control-code code points from UTF-8.

In [37]:
%%time
extract_all_chars(selected_path)
sclist = char_stats(selected_path)
2899 new chars on file selected_sources_small/List_of_Unicode_characters.txt 
48 new chars on file selected_sources_small/Cyrillic_Supplement.txt 
3 new chars on file selected_sources_small/Greek_and_Coptic.txt 
90 new chars on file selected_sources_small/Unicode_and_HTML_for_the_Hebrew_alphabet.txt 
0 new chars on file selected_sources_small/Cyrillic_Unicode_block.txt 
Total number of symbols: 3040
Number of Selected Symbols: 1569
CPU times: user 19.7 ms, sys: 3.91 ms, total: 23.6 ms
Wall time: 22.6 ms
In [38]:
import pickle

path = '../..//minibrain/predictors/sequence/text/utf8-codes/txt2num_2seg.pkl'

txt2num = pickle.load(open(path, 'rb'))
In [39]:
utf8chars_2seg = txt2num.keys()
In [40]:
ccodes = set(list(utf8chars_2seg)[:32])
In [41]:
sclist = char_stats(selected_path)
2899 new chars on file selected_sources_small/List_of_Unicode_characters.txt 
48 new chars on file selected_sources_small/Cyrillic_Supplement.txt 
3 new chars on file selected_sources_small/Greek_and_Coptic.txt 
90 new chars on file selected_sources_small/Unicode_and_HTML_for_the_Hebrew_alphabet.txt 
0 new chars on file selected_sources_small/Cyrillic_Unicode_block.txt 
Total number of symbols: 3040
Number of Selected Symbols: 1569
In [42]:
all_codes = set(sclist)
all_codes.update(ccodes)
In [43]:
all_codes = sorted(list(all_codes))
len(all_codes)
Out[43]:
1601
In [44]:
from sparse_encoders import create_codematrix_from_conf
# generating codeblock and code dict 
# config = [(2, 1916, (24, 3), (3, 5, 11, 13), (6, 2), 64, 9 / 64)]
# config = [(2, 2112, (24, 3), (3, 5, 11, 13), (6, 2), 64, 9 / 64)]
config = [(2, 2112, (24, 3), (3, 5, 11, 13), (4, 6, 8, 10, 12), 96, 13 / 96)]
create_codematrix_from_conf(config)
| Segments | code size | Vector Size | N | k | primes | cycles | exec_time (sec) |  Matrix Size in Disk (MB):                            | Sparse Matrix Size in Disk (MB): |code path
| 2 | 2112 | (2112, 96) | 24 | 3 | (3, 5, 11, 13) | (4, 6, 8, 10, 12) | 0.012 | 0.19 | 0.14 | codes/utf8_2-seg_2112-codepoints_96-dim_N-24-k3_coprimes-(3, 5, 11, 13)_cycles-(4, 6, 8, 10, 12)_dense |
Out[44]:
[array([[ True, False, False, ...,  True,  True,  True],
        [False,  True, False, ...,  True,  True,  True],
        [False, False,  True, ...,  True,  True,  True],
        ...,
        [ True, False, False, ..., False, False, False],
        [False,  True, False, ..., False, False, False],
        [False, False,  True, ..., False, False, False]])]
In [45]:
from collections import OrderedDict
codes_ids = OrderedDict(list(enumerate(all_codes)))
In [46]:
import pickle
fname = 'codes/adhoc-code-1916-codepoints.pkl'
with open(fname, 'wb') as f:
    pickle.dump(codes_ids, f, pickle.HIGHEST_PROTOCOL)
In [47]:
from sparse_encoders import create_codebook
from preprocessors.symbol_extraction import *
In [49]:
charset = char_stats(selected_path)
2899 new chars on file selected_sources_small/List_of_Unicode_characters.txt 
48 new chars on file selected_sources_small/Cyrillic_Supplement.txt 
3 new chars on file selected_sources_small/Greek_and_Coptic.txt 
90 new chars on file selected_sources_small/Unicode_and_HTML_for_the_Hebrew_alphabet.txt 
0 new chars on file selected_sources_small/Cyrillic_Unicode_block.txt 
Total number of symbols: 3040
Number of Selected Symbols: 1569

Excluding more characters to make the charset smaller and thus the Softmax faster (due to a failure with FAISS on GPU at the moment, I need to cut the number of chars).

In [50]:
%%time
# all codebook and dictionaries are created with this function:
# CHARSET_PATH = "codes/all_chars.chars"
config = (2, 1838+33, (24, 3), (3, 5, 11, 13), (4, 6, 8, 10, 12), 96, 13 / 96)
ofname = "codes/adhoc-codebook-1871.pkl"
codebook = create_codebook(charset, config, ofname, reserved_spaces=33)
| Segments | code size | Vector Size | N | k | primes | cycles | exec_time (sec) |  Matrix Size in Disk (MB):                            | Sparse Matrix Size in Disk (MB): |code path
| 2 | 1871 | (1871, 96) | 24 | 3 | (3, 5, 11, 13) | (4, 6, 8, 10, 12) | 0.004 | 0.17 | 0.12 | codes/utf8_2-seg_1871-codepoints_96-dim_N-24-k3_coprimes-(3, 5, 11, 13)_cycles-(4, 6, 8, 10, 12)_dense |
saving file codes/adhoc-codebook-1871.pkl with codes.shape (1871, 96) | char2int 1609 | int2char 1602
CPU times: user 6.72 ms, sys: 599 µs, total: 7.32 ms
Wall time: 6.24 ms
In [51]:
code, char2int, int2char = codebook

Here we manually verify some code properties and make sure that everything was built correctly.

In [52]:
code.shape
Out[52]:
(1871, 96)
In [53]:
len(char2int.keys()), len(int2char.keys())
Out[53]:
(1609, 1602)
In [54]:
set(char2int.keys()).difference(set(int2char.values()))
Out[54]:
{'\x00', '\x01', '\x02', '\x03', '\x04', '\x15', '\x1a'}
In [55]:
set(int2char.keys()).difference(set(char2int.values())), set(char2int.values()).difference(set(int2char.keys()))
Out[55]:
(set(), set())
In [56]:
int2char[10], int2char[9], int2char[33], int2char[32], 
Out[56]:
('\n', '\t', '!', ' ')
In [57]:
char2int['\n'], char2int['\t']
Out[57]:
(10, 9)
In [58]:
max(int2char.keys()), max(char2int.values())
Out[58]:
(1601, 1601)
In [59]:
int2char[32]
Out[59]:
' '
In [60]:
char2int[' ']
Out[60]:
32

Decoding¶

Decoding the vectors can be done by cosine similarity (or any other vector similarity method); in this case we use the faiss library from Facebook AI Research.
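A minimal sketch of this similarity-search decoding with faiss (a random stand-in replaces the real codebook matrix; L2-normalizing before an inner-product index turns inner product into cosine similarity):

import numpy as np
import faiss

# stand-in for the dense codebook matrix (n_symbols x d)
codebook = (np.random.rand(1871, 96) < 0.14).astype('float32')

index = faiss.IndexFlatIP(codebook.shape[1])    # inner-product index
normed = codebook.copy()
faiss.normalize_L2(normed)                      # so inner product == cosine similarity
index.add(normed)

# a noisy network output that should decode back to symbol 42
query = codebook[42:43] + np.random.normal(0, 0.1, (1, 96)).astype('float32')
faiss.normalize_L2(query)
distances, ids = index.search(query, 1)
print(ids[0, 0])                                # expected: 42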

There are other ways of dealing with decoding, as the code is a binary vector and we know the parity beforehand.

For these character-level encodings one trick is, at the last layer, to use multiple softmax layers (Multi-Softmax) instead of a single big one, depending on the code configuration: one per part of the code, as defined by the code configuration. It is known in the domain that big softmax layers are one of the performance issues, so multiple studies try to limit the performance cost they generate. (TODO: add more references; I remember a Google paper where they cut the number of trainable parameters in an NLP network by repeating all the transformer blocks and adding a technique at the input and output level to make the input smaller)

Small softmax layers are much faster to process than big ones, although this remains to be measured as future work, depending on the actual size of the softmax (vectors that are too small might not take real advantage of GPGPUs); maybe for really small vectors it would be good enough to do some clipping and binarization, or to just set the biggest element to 1 and the rest to 0, which would have the issue of not being differentiable.

Ideas (TODO): some different techniques should be tested, as a mixture of:

  • Last layer as Softmax -> for small final layers this can also be applied and checked for performance. This is useful also because I can't currently run faiss on GPU (something fails there)
  • Last layer as Sigmoid -> this can be used by all the codebooks
  • Last layer as Multi-Softmax, one per part of the code (dependent on the input codebook but pre-defined); this can only be done for the SME-UTF8 and co-prime methods.

With:

  • Vector Clipping
  • Binarization

Note: before the last decoding layer we can use Transformer blocks with as many heads as there are ones in the vector; each head should be able to direct attention to a different important vector dimension.
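A minimal PyTorch sketch of the Multi-Softmax idea for a co-prime code (the layout, one softmax per prime-sized block predicting $j \bmod p$, is an assumption for illustration; it is not the evaluation network used later):

import torch
import torch.nn as nn

class MultiSoftmaxHead(nn.Module):
    # one small softmax per part of the code instead of a single big one
    def __init__(self, hidden_dim, primes=(3, 5, 11, 13)):
        super().__init__()
        self.heads = nn.ModuleList([nn.Linear(hidden_dim, p) for p in primes])

    def forward(self, h):
        # one log-probability block per prime, concatenated along the code axis
        return torch.cat([torch.log_softmax(head(h), dim=-1) for head in self.heads], dim=-1)

head = MultiSoftmaxHead(hidden_dim=512)
h = torch.randn(8, 512)          # a batch of decoder states
log_probs = head(h)              # shape: (8, 3 + 5 + 11 + 13) = (8, 32)
# training would use one NLLLoss term per block, with target (j mod p) for each prime p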

Decoding on Dedicated Hardware¶

The other advantage of small softmax layers is that they can be implemented in hardware (ASIC or FPGA, for which available work exists), so a specific text encoder and decoder can be implemented for the given encoding techniques and configurations.

There are also techniques available for doing similarity search in hardware (which is a need especially in biological applications); the fact that the vector sizes and the number of elements are fixed makes it feasible to create and maintain the vectors in memory, and whether applying error correcting codes adds some (if any) advantage should be studied.

Method Validation¶

To validate this method it is sufficient to show that the performance of a network does not decay, compared to one-hot, on several tasks. This article deals with this in a restricted environment.

There are a few key points to measure:

  • Pre-processing time (dataset) for each method
  • Network Performance
  • Network Size (total and trainable parameters)
  • Network Memory Consumption
  • Network Training Time

To this end, simple enough NLP tasks will be tackled such that the training and testing time is not excessive (running on a single end-user GPU card, in this case an RTX 2080 Ti).

The tasks will be evaluated on the same networks (except for the first embedding layer) with different encodings; to be able to compare networks, everything will be done at character level. For completeness some other methods will also be evaluated, mainly BPE, which is currently used in most SoTA papers.

Preliminary conclusions of the first iteration on this work¶

While the approach up to here shows that the network can learn a representation given the proposed encodings, it has some difficulties due to the dataset size and the number of network parameters.

It starts to learn correctly and shows a quick improvement in the loss, although it makes errors in the decoding. The errors are:

  • on the language detection task, mostly the orthography is a bit off, but the main idea works correctly.
  • on the different tasks, the results are not good at all

After simplifying the tasks, going only to PoS tagging instead of all of them, the errors (manually checked) are still mainly a mismatch between the number of tags produced and the number expected.

From this we can infer that the dataset is not big enough AND the network is not big enough AND we don't have enough resources. But while this shows some promise, the expectations placed on a network without much a-priori external knowledge seem too high.

Several studies show that adding external a-priori information about the task can not only reduce the number of parameters needed but also improve the resulting metrics.

  • Token-level Dynamic Self-Attention Network for Multi-Passage Reading Comprehension
  • Attending to Entities for Better Text Understanding
  • Linguistically-Informed Self-Attention for Semantic Role Labeling

So the current focus is on augmenting the information given to the network, not only in the encoding but also in how the network can deal with externally given or added information.

For this purpose we start studying a multi-step encoding in the NewCompositionalCodebook notebook; its main idea is:

  • we create a basic code for the main base symbols from which all other symbols will be composed
  • now we create a new composed codebook of symbols, including the ones that created the base codebook (so everything has the same format). The composed symbols (like á à ô ü, etc.) are built by different operations on the basic symbols; these operations are convolution and sum of the vectors, plus some other analysis (upper/lower case, is-num, is-alpha, etc.). The final composed code is a concatenation of some of the created fields (choosing all of them might be too big, so this is a compromise choice)
  • words and other more complex composed symbols are created in the same way as in the previous step

For the encoding, the wanted set of tokens can then be composed by operating in the same manner. All elements are normalized with NFKC for the first normalization; during compositional code generation the normalization is NFKD, to separate composed symbols into their basic unitary components (a small sketch follows).
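A sketch of the normalization and composition steps (the 8-bit base codes and the exact mix of operations are placeholders of mine; the real codebooks are built in the NewCompositionalCodebook notebook):

```python
import unicodedata
import numpy as np

# Hypothetical base codes for the unit symbols (values are placeholders).
base_codes = {
    "a":      np.array([1, 0, 0, 1, 0, 0, 1, 0]),
    "\u0301": np.array([0, 1, 0, 0, 1, 0, 0, 1]),  # combining acute accent
}

token = unicodedata.normalize("NFKC", "á")    # first normalization pass
parts = unicodedata.normalize("NFKD", token)  # decompose into base symbol + accent

# Compose the code for "á" from its unit codes: a sum and a convolution of the
# parts, concatenated with simple analysis flags (upper/lower, is-num, is-alpha).
vecs = [base_codes[p] for p in parts]
summed = np.clip(np.sum(vecs, axis=0), 0, 1)
conv = np.clip(np.convolve(vecs[0], vecs[-1], mode="same"), 0, 1)
flags = np.array([token.isupper(), token.isnumeric(), token.isalpha()], dtype=int)
composed = np.concatenate([summed, conv, flags])
```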

During training of the different networks one of the issues was NaNs appearing on several occasions. This led to a quite frustrating journey where 12+ hours of training were stalled many times, followed by restarting training from different checkpoints and with different seeds to make things work. After many frustrating hours, sometimes the networks converged and sometimes they diverged. This seems to be a known issue: on 10/03/2020 (dd/mm/yyyy) a pre-print was published, ReZero is All You Need: Fast Convergence at Large Depth, that explains the issue and gives a solution, which of course I SHOULD BE USING.

Kind of tokens to use - Ideas to explore¶

This is quite an important issue. While having more tokens facilitates the encoding and the given information, it also makes the encoder and decoder bigger in memory.

In the current status of the encoder this does not make training slower, as the values are fixed, but it does make the decoder bigger, as we are currently using a one-hot vector for the decoder. So the first step would be to replace the one-hot decoder step with a faiss similarity search. This has its own disadvantage, as it is memory hungry.

Discussion on the current status of NLP and ideas to take into account for the encoder¶

The current NLP state of the art takes multiple steps to make things easier for the network to learn. This includes lowercasing everything, normalizing text, eliminating diacritics, and so on (to name a few examples). Also, the current multi-language approaches are networks trained mainly in English and later "specialized" in other languages, which creates issues; works like How Multilingual is Multilingual BERT? show some of these problems. These language specializations come in different files, so a network loads one of the models, and multiple models are needed for multiple languages. This leads to another issue: when more than one language is mixed in the same context (as would happen in a real multi-lingual scenario), these networks cannot necessarily understand the mix. Even if it is not a really big issue, it is something that happens often in the reality of multi-lingual people (like the author of this text) and is frustrating, so handling it would be a really nice to have.

So for the encoding scheme we would like to not only make it sparse, easy, and deterministic to generate, but also to include redundancies and, in a way, different normalization statuses in the code. This is all explored and done in the NewCompositionalCodebook_exploration.ipynb source code.

The codes generated there:

  • can capture order of the symbols in a token as well as non-order (bag-of-words like representation)
  • have variants with and without diacritics and accidentals
  • work with symbol normalization to cut the number of equivalent symbols to a few
  • represent upper/lower case and other casing variants while maintaining a lowercase representation
  • can be created in different dimensions so as to fulfill different requirements.

Tokenization discussion¶

Tokenization can take different forms (BPE, word level, sentencepiece, character level, etc) and is an active field of research.

In this work we are looking for a tokenization that allows the following features:

  • is deterministic (unlike sentencepiece, for example) and always the same for the same input
  • is representative AND extensible
  • allows for infinite vocabulary (unlike sentencepiece)
  • can generate new tokens from existing ones
  • does not take too much memory
  • we can use with a similarity search database
  • manages to show different characteristics of the encoded token (like the NewCompositionalCodebook Encoding)

Many of these points are already fulfilled by all the character-level encodings, especially the NewCompositionalCodebook (which, while more complex than previous attempts, is more expressive).

The current encoding also has the advantage of being able to capture similarities in the case of misspellings (or different spellings, like with/without diacritics and accidentals) without changing the entire codeword; it is only altered in certain positions. If we want to correct the inputs we can do a similarity search before encoding and sending the vector to the neural network, even if this means more pre-processing.

The next step is deciding whether word or sub-word level encodings make sense to add. While this would make the sequences to represent shorter and more expressive, it would also make the encoder and decoder dictionaries bigger. The issue with the current pytorch and faiss setup is that this dictionary would need to be duplicated, which makes memory consumption an issue.

The other issue here is that taking the multi-lang approach from the beginning makes this memory requirement even more intensive. For this, a compromise decision and measurements are needed.

Possible approaches for tokenization and extra information¶

  • A sub-word tokenization with the most common M values, using a sentencepiece-like approach
  • A possible approach would be to make a list of the most common N (with N > 1000, for example) words in each included language
  • Another (complementary) approach would be to load into GPU memory only the tokenizations of the current target language for the decoder, and only those of the source language for the encoder. This approach is not feasible if a batch contains many languages and can be slow due to memory transfers (although this is a compromise to take into account). It has the advantage of supporting many languages; the disadvantage is that words foreign to the source language would be encoded/decoded at character level.
  • We could add the source and target language as extra information instead of making the network guess it. This would be a big extra help. This language information could also serve as the key source for a memory in the network that allows for different mappings.
  • The encoding step will work greedily, encoding with the longest possible sequence found; this makes the tokenization always the same. This can be done with an encoder represented as a tree (a minimal sketch follows this list).
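A minimal sketch of this greedy, longest-match-first encoding over a tree (trie); the vocabulary and function names are illustrative only, not the library's API:

```python
def build_trie(vocab):
    """Build a character trie; a node stores '_end' when it completes a token."""
    root = {}
    for token in vocab:
        node = root
        for ch in token:
            node = node.setdefault(ch, {})
        node["_end"] = token
    return root

def greedy_encode(text, trie):
    """Always emit the longest token found at the current position (falling back
    to single characters), so the tokenization is deterministic."""
    tokens, i = [], 0
    while i < len(text):
        node, j, longest = trie, i, None
        while j < len(text) and text[j] in node:
            node = node[text[j]]
            j += 1
            if "_end" in node:
                longest = (node["_end"], j)
        if longest is None:
            tokens.append(text[i]); i += 1
        else:
            tokens.append(longest[0]); i = longest[1]
    return tokens

print(greedy_encode("the theory", build_trie({"the", "theory", " "})))
# -> ['the', ' ', 'theory']
```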

While the network can be trained (like LISA) to do POS tagging, and other information can also be given to the network, this work also wants to focus on what kind of external information can be provided. In LISA, one head is trained to attend to the syntactic parent of the token. A more comprehensive approach could be taken, adding an external source where a head can attend to the resulting syntactic/semantic role of the token as well as many other references (tense, gender, number, declension, etc.). This could be an interesting approach: as the representation of the different elements can be learnt, these heads can (and will) provide external information, quantized by an internal representation in a memory holding the different possibilities. Some extensions to this are already in place in Attending to Entities for Better Text Understanding.

This external information can take different forms like:

  • giving the parent dependency tree/graph as input
  • giving some possible pos tags or another kind of syntactic and or semantic information
  • having a way of giving examples (as in Matching Networks for One Shot Learning) that allow the network to consult examples while learning, or Amazon's Empirical Bayes Transductive Meta-Learning with Synthetic Gradients meta-learning approach
  • Having a way of encoding external examples to add them to a NeuralDB that is later used as a base for each layer, each layer having different entries in the DB

The challenges here are multiple:

  • selecting what kind of information can be given
  • selecting one or multiple ways of adding that external knowledge
  • encoding that information in a "good" way (for whatever good can mean here)
  • training different networks to do that job?
  • Having specialized sub-modules for each part?

One approach to having extra information would be to create a comprehensive dictionary from the beginning (like downloading the entire wikimatrix database) and adding the information from there (gender, tense, etc.) to the sentence.

This is another open point in my internal discussion that I still have to settle.

Ideas¶

Thinking about some ideas while reading the following extract, I think that a graph representation of the meta-concepts (POS tag, syntactic/grammatical function, etc.) could be useful:

From Deep Learning Based Text Classification: A Comprehensive Review (April 2020), p. 15:

"Yao et al. [103] used a similar Graph CNN (GCNN) model for text classification. They built a single text graph
for a corpus based on word co-occurrence and document word relations, then learned a Text Graph Convolutional
Network (Text GCN) for the corpus, as shown in Fig. 14. The Text GCN is initialized with one-hot representation
for word and document, and then jointly learns the embeddings for both words and documents, as supervised by
the known class labels for documents."


For the moment I've worked on compositionality for the token representation. There is also the similarity and inclusion search that needs to be addressed; for this I might have to look into Bloom Filters, so I note it here to not forget about it (a toy sketch follows).
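As a reminder of what that inclusion search could look like, here is a toy Bloom-filter membership check (sizes and hash choice are arbitrary assumptions; nothing in the notebooks implements this yet):

```python
import hashlib

class BloomFilter:
    """Probabilistic set membership: no false negatives, rare false positives."""
    def __init__(self, n_bits=1024, n_hashes=3):
        self.n_bits, self.n_hashes = n_bits, n_hashes
        self.bits = bytearray(n_bits // 8)

    def _positions(self, item):
        for i in range(self.n_hashes):
            h = hashlib.sha256(f"{i}:{item}".encode()).digest()
            yield int.from_bytes(h[:8], "big") % self.n_bits

    def add(self, item):
        for p in self._positions(item):
            self.bits[p // 8] |= 1 << (p % 8)

    def __contains__(self, item):
        return all(self.bits[p // 8] & (1 << (p % 8)) for p in self._positions(item))

bf = BloomFilter()
bf.add("token")
assert "token" in bf  # items that were added are always reported as present
```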


While most models concentrate on first pre-training a Language Model (LM) and then fine-tuning, I want to concentrate on a meta-model that can deal with different tasks.

For this I want to work in a constructive manner, where simpler support tasks are trained first and the complexity of the network is then increased to take advantage of the previously trained tasks.

Also, instead of trying to train on long stretches of text, I'll start with simpler and shorter phrases, doing POS, parent tagging and other tasks available in the CONLLU dataset simultaneously with some basic LM tasks as in Unified Language Model Pre-training; this should give a base from which to build a more comprehensive Language Modeling task.

Then, adding the Unified Language Model pre-training together with the approach of Backpropamine: Neuromodulated Plasticity and the Neuromodulated Meta-Learning Algorithm (ANML) from Uber labs.

Conclusion¶

This work shows a different take on the current approach to input coding for Natural Language Processing tasks on Deep Learning networks.

Several different ways of encoding, as well as the code to generate them, are available in the repository.

Future Work¶

This partial study is part of a deeper study on how to make networks train faster and run on consumer-grade GPUs in a competitive way (even if they are not SoTA).

Next steps are:

  • creating an encoding mapping for a dataset
  • testing the performance of a set of NNs, including the size of the networks