nltk.tokenize.ToktokTokenizer

class nltk.tokenize.ToktokTokenizer[source]

Bases: TokenizerI

This is a Python port of tok-tok.pl from https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl

>>> toktok = ToktokTokenizer()
>>> text = u'Is 9.5 or 525,600 my favorite number?'
>>> print(toktok.tokenize(text, return_str=True))
Is 9.5 or 525,600 my favorite number ?
>>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
>>> print(toktok.tokenize(text, return_str=True))
The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
>>> text = u'¡This, is a sentence with weird» symbols… appearing everywhere¿'
>>> expected = u'¡ This , is a sentence with weird » symbols … appearing everywhere ¿'
>>> assert toktok.tokenize(text, return_str=True) == expected
>>> toktok.tokenize(text) == [u'¡', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'»', u'symbols', u'…', u'appearing', u'everywhere', u'¿']
True
NON_BREAKING = (re.compile('\xa0'), ' ')
FUNKY_PUNCT_1 = (re.compile('([،;؛¿!"\\])}»›”؟¡%٪°±©®।॥…])'), ' \\1 ')
FUNKY_PUNCT_2 = (re.compile('([({\\[“‘„‚«‹「『])'), ' \\1 ')
EN_EM_DASHES = (re.compile('([–—])'), ' \\1 ')
AMPERCENT = (re.compile('& '), '&amp; ')
TAB = (re.compile('\t'), ' &#9; ')
PIPE = (re.compile('\\|'), ' &#124; ')
COMMA_IN_NUM = (re.compile('(?<!,)([,،])(?![,\\d])'), ' \\1 ')
PROB_SINGLE_QUOTES = (re.compile("(['’`])"), ' \\1 ')
STUPID_QUOTES_1 = (re.compile(' ` ` '), ' `` ')
STUPID_QUOTES_2 = (re.compile(" ' ' "), " '' ")
FINAL_PERIOD_1 = (re.compile('(?<!\\.)\\.$'), ' .')
FINAL_PERIOD_2 = (re.compile('(?<!\\.)\\.\\s*(["\'’»›”]) *$'), ' . \\1')
MULTI_COMMAS = (re.compile('(,{2,})'), ' \\1 ')
MULTI_DASHES = (re.compile('(-{2,})'), ' \\1 ')
MULTI_DOTS = (re.compile('(\\.{2,})'), ' \\1 ')
OPEN_PUNCT = '([{༺༼᚛‚„⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝([{⦅「'
CLOSE_PUNCT = ')]}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞)]}⦆」'
CURRENCY_SYM = '$¢£¤¥֏؋৲৳৻૱௹฿៛₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺꠸﷼﹩$¢£¥₩'
OPEN_PUNCT_RE = (re.compile('([([{༺༼᚛‚„⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝([{⦅「])'), '\\1 ')
CLOSE_PUNCT_RE = (re.compile('([)]}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞)]}⦆」])'), '\\1 ')
CURRENCY_SYM_RE = (re.compile('([$¢£¤¥֏؋৲৳৻૱௹฿៛₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺꠸﷼﹩$¢£¥₩])'), '\\1 ')
URL_FOE_1 = (re.compile(':(?!//)'), ' : ')
URL_FOE_2 = (re.compile('\\?(?!\\S)'), ' ? ')
URL_FOE_3 = (re.compile('(:\\/\\/)[\\S+\\.\\S+\\/\\S+][\\/]'), ' / ')
URL_FOE_4 = (re.compile(' /'), ' / ')
LSTRIP = (re.compile('^ +'), '')
RSTRIP = (re.compile('\\s+$'), '\n')
ONE_SPACE = (re.compile(' {2,}'), ' ')
TOKTOK_REGEXES = [(re.compile('\xa0'), ' '), (re.compile('([،;؛¿!"\\])}»›”؟¡%٪°±©®।॥…])'), ' \\1 '), (re.compile(':(?!//)'), ' : '), (re.compile('\\?(?!\\S)'), ' ? '), (re.compile('(:\\/\\/)[\\S+\\.\\S+\\/\\S+][\\/]'), ' / '), (re.compile(' /'), ' / '), (re.compile('& '), '&amp; '), (re.compile('\t'), ' &#9; '), (re.compile('\\|'), ' &#124; '), (re.compile('([([{༺༼᚛‚„⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝([{⦅「])'), '\\1 '), (re.compile('([)]}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞)]}⦆」])'), '\\1 '), (re.compile('(,{2,})'), ' \\1 '), (re.compile('(?<!,)([,،])(?![,\\d])'), ' \\1 '), (re.compile('(?<!\\.)\\.\\s*(["\'’»›”]) *$'), ' . \\1'), (re.compile("(['’`])"), ' \\1 '), (re.compile(' ` ` '), ' `` '), (re.compile(" ' ' "), " '' "), (re.compile('([$¢£¤¥֏؋৲৳৻૱௹฿៛₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺꠸﷼﹩$¢£¥₩])'), '\\1 '), (re.compile('([–—])'), ' \\1 '), (re.compile('(-{2,})'), ' \\1 '), (re.compile('(\\.{2,})'), ' \\1 '), (re.compile('(?<!\\.)\\.$'), ' .'), (re.compile('(?<!\\.)\\.\\s*(["\'’»›”]) *$'), ' . \\1'), (re.compile(' {2,}'), ' ')]
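TOKTOK_REGEXES is the ordered list of (pattern, substitution) pairs that drives tokenization: the substitutions shown above pad the matched material with spaces, and the default list output is consistent with splitting the padded string on whitespace (compare the two forms in the class example). A minimal sketch of that flow, assuming the pairs are applied in list order; the helper name apply_toktok_regexes is illustrative, not part of the API:

from nltk.tokenize import ToktokTokenizer

def apply_toktok_regexes(text, regexes=ToktokTokenizer.TOKTOK_REGEXES, return_str=False):
    # Run each compiled (pattern, replacement) pair in order.
    for regexp, substitution in regexes:
        text = regexp.sub(substitution, text)
    text = text.strip()
    # Hand back the space-padded string, or split it into tokens.
    return text if return_str else text.split()
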
tokenize(text, return_str=False)[source]

Return a tokenized copy of text. If return_str is True, return the tokenized string instead of a list of tokens.

Return type

List[str]
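
With the default return_str=False, the input from the class example comes back as a list of tokens; the expected output below is simply the whitespace split of the string shown earlier:

>>> toktok = ToktokTokenizer()
>>> toktok.tokenize(u'Is 9.5 or 525,600 my favorite number?')
['Is', '9.5', 'or', '525,600', 'my', 'favorite', 'number', '?']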

span_tokenize(s: str) → Iterator[Tuple[int, int]][source]

Identify the tokens using integer offsets (start_i, end_i), where s[start_i:end_i] is the corresponding token.

Return type

Iterator[Tuple[int, int]]

Parameters

s (str) –
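
ToktokTokenizer inherits this interface from TokenizerI. A sketch of what the offset contract means, using hand-picked spans rather than output of this class:

>>> s = 'Is 9.5 my favorite number?'
>>> spans = [(0, 2), (3, 6), (7, 9), (10, 18), (19, 25), (25, 26)]
>>> [s[start:end] for start, end in spans]
['Is', '9.5', 'my', 'favorite', 'number', '?']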

span_tokenize_sents(strings: List[str]) → Iterator[List[Tuple[int, int]]][source]

Apply self.span_tokenize() to each element of strings. I.e.:

return [self.span_tokenize(s) for s in strings]

Yields

List[Tuple[int, int]]

Parameters

strings (List[str]) –

Return type

Iterator[List[Tuple[int, int]]]

tokenize_sents(strings: List[str]) → List[List[str]][source]

Apply self.tokenize() to each element of strings. I.e.:

return [self.tokenize(s) for s in strings]

Return type

List[List[str]]

Parameters

strings (List[str]) –
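
For example, applied to a one-element list of strings (the expected output follows from the class example above):

>>> toktok = ToktokTokenizer()
>>> toktok.tokenize_sents([u'Is 9.5 or 525,600 my favorite number?'])
[['Is', '9.5', 'or', '525,600', 'my', 'favorite', 'number', '?']]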