nltk.tokenize.ToktokTokenizer

class nltk.tokenize.ToktokTokenizer[source]

Bases: TokenizerI

This is a Python port of tok-tok.pl from https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl

>>> toktok = ToktokTokenizer()
>>> text = u'Is 9.5 or 525,600 my favorite number?'
>>> print(toktok.tokenize(text, return_str=True))
Is 9.5 or 525,600 my favorite number ?
>>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
>>> print(toktok.tokenize(text, return_str=True))
The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
>>> text = u'¡This, is a sentence with weird» symbols… appearing everywhere¿'
>>> expected = u'¡ This , is a sentence with weird » symbols … appearing everywhere ¿'
>>> assert toktok.tokenize(text, return_str=True) == expected
>>> toktok.tokenize(text) == [u'¡', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'»', u'symbols', u'…', u'appearing', u'everywhere', u'¿']
True
NON_BREAKING = (re.compile('\xa0'), ' ')
FUNKY_PUNCT_1 = (re.compile('([،;؛¿!"\\])}»›”؟¡%٪°±©®।॥…])'), ' \\1 ')
FUNKY_PUNCT_2 = (re.compile('([({\\[“‘„‚«‹「『])'), ' \\1 ')
EN_EM_DASHES = (re.compile('([–—])'), ' \\1 ')
AMPERCENT = (re.compile('& '), '&amp; ')
TAB = (re.compile('\t'), ' &#9; ')
PIPE = (re.compile('\\|'), ' &#124; ')
COMMA_IN_NUM = (re.compile('(?<!,)([,،])(?![,\\d])'), ' \\1 ')
PROB_SINGLE_QUOTES = (re.compile("(['’`])"), ' \\1 ')
STUPID_QUOTES_1 = (re.compile(' ` ` '), ' `` ')
STUPID_QUOTES_2 = (re.compile(" ' ' "), " '' ")
FINAL_PERIOD_1 = (re.compile('(?<!\\.)\\.$'), ' .')
FINAL_PERIOD_2 = (re.compile('(?<!\\.)\\.\\s*(["\'’»›”]) *$'), ' . \\1')
MULTI_COMMAS = (re.compile('(,{2,})'), ' \\1 ')
MULTI_DASHES = (re.compile('(-{2,})'), ' \\1 ')
MULTI_DOTS = (re.compile('(\\.{2,})'), ' \\1 ')
OPEN_PUNCT = '([{༺༼᚛‚„⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝([{⦅「'
CLOSE_PUNCT = ')]}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞)]}⦆」'
CURRENCY_SYM = '$¢£¤¥֏؋৲৳৻૱௹฿៛₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺꠸﷼﹩$¢£¥₩'
OPEN_PUNCT_RE = (re.compile('([([{༺༼᚛‚„⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝([{⦅「])'), '\\1 ')
CLOSE_PUNCT_RE = (re.compile('([)]}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞)]}⦆」])'), '\\1 ')
CURRENCY_SYM_RE = (re.compile('([$¢£¤¥֏؋৲৳৻૱௹฿៛₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺꠸﷼﹩$¢£¥₩])'), '\\1 ')
URL_FOE_1 = (re.compile(':(?!//)'), ' : ')
URL_FOE_2 = (re.compile('\\?(?!\\S)'), ' ? ')
URL_FOE_3 = (re.compile('(:\\/\\/)[\\S+\\.\\S+\\/\\S+][\\/]'), ' / ')
URL_FOE_4 = (re.compile(' /'), ' / ')
LSTRIP = (re.compile('^ +'), '')
RSTRIP = (re.compile('\\s+$'), '\n')
ONE_SPACE = (re.compile(' {2,}'), ' ')
TOKTOK_REGEXES = [(re.compile('\xa0'), ' '), (re.compile('([،;؛¿!"\\])}»›”؟¡%٪°±©®।॥…])'), ' \\1 '), (re.compile(':(?!//)'), ' : '), (re.compile('\\?(?!\\S)'), ' ? '), (re.compile('(:\\/\\/)[\\S+\\.\\S+\\/\\S+][\\/]'), ' / '), (re.compile(' /'), ' / '), (re.compile('& '), '&amp; '), (re.compile('\t'), ' &#9; '), (re.compile('\\|'), ' &#124; '), (re.compile('([([{༺༼᚛‚„⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝([{⦅「])'), '\\1 '), (re.compile('([)]}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞)]}⦆」])'), '\\1 '), (re.compile('(,{2,})'), ' \\1 '), (re.compile('(?<!,)([,،])(?![,\\d])'), ' \\1 '), (re.compile('(?<!\\.)\\.\\s*(["\'’»›”]) *$'), ' . \\1'), (re.compile("(['’`])"), ' \\1 '), (re.compile(' ` ` '), ' `` '), (re.compile(" ' ' "), " '' "), (re.compile('([$¢£¤¥֏؋৲৳৻૱௹฿៛₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺꠸﷼﹩$¢£¥₩])'), '\\1 '), (re.compile('([–—])'), ' \\1 '), (re.compile('(-{2,})'), ' \\1 '), (re.compile('(\\.{2,})'), ' \\1 '), (re.compile('(?<!\\.)\\.$'), ' .'), (re.compile('(?<!\\.)\\.\\s*(["\'’»›”]) *$'), ' . \\1'), (re.compile(' {2,}'), ' ')]
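TOKTOK_REGEXES is the ordered list of (pattern, substitution) pairs that drives tokenization: the substitutions shown above pad the matched material with spaces, and the default list output is consistent with splitting the padded string on whitespace (compare the two forms in the class example). A minimal sketch of that flow, assuming the pairs are applied in list order; the helper name apply_toktok_regexes is illustrative, not part of the API:

from nltk.tokenize import ToktokTokenizer

def apply_toktok_regexes(text, regexes=ToktokTokenizer.TOKTOK_REGEXES, return_str=False):
    # Run each compiled (pattern, replacement) pair in order.
    for regexp, substitution in regexes:
        text = regexp.sub(substitution, text)
    text = text.strip()
    # Hand back the space-padded string, or split it into tokens.
    return text if return_str else text.split()
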
tokenize(text, return_str=False)[source]

Return a tokenized copy of text. If return_str is True, return the tokenized string instead of a list of tokens.

Return type

List[str]
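
With the default return_str=False, the input from the class example comes back as a list of tokens; the expected output below is simply the whitespace split of the string shown earlier:

>>> toktok = ToktokTokenizer()
>>> toktok.tokenize(u'Is 9.5 or 525,600 my favorite number?')
['Is', '9.5', 'or', '525,600', 'my', 'favorite', 'number', '?']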

span_tokenize(s: str) → Iterator[Tuple[int, int]][source]

Identify the tokens using integer offsets (start_i, end_i), where s[start_i:end_i] is the corresponding token.

Return type

Iterator[Tuple[int, int]]

Parameters

s (str) –
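
ToktokTokenizer inherits this interface from TokenizerI. A sketch of what the offset contract means, using hand-picked spans rather than output of this class:

>>> s = 'Is 9.5 my favorite number?'
>>> spans = [(0, 2), (3, 6), (7, 9), (10, 18), (19, 25), (25, 26)]
>>> [s[start:end] for start, end in spans]
['Is', '9.5', 'my', 'favorite', 'number', '?']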

span_tokenize_sents(strings: List[str]) → Iterator[List[Tuple[int, int]]][source]

Apply self.span_tokenize() to each element of strings. I.e.:

return [self.span_tokenize(s) for s in strings]

Yields

List[Tuple[int, int]]

Parameters

strings (List[str]) –

Return type

Iterator[List[Tuple[int, int]]]

tokenize_sents(strings: List[str]) → List[List[str]][source]

Apply self.tokenize() to each element of strings. I.e.:

return [self.tokenize(s) for s in strings]

Return type

List[List[str]]

Parameters

strings (List[str]) –
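
For example, applied to a one-element list of strings (the expected output follows from the class example above):

>>> toktok = ToktokTokenizer()
>>> toktok.tokenize_sents([u'Is 9.5 or 525,600 my favorite number?'])
[['Is', '9.5', 'or', '525,600', 'my', 'favorite', 'number', '?']]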