nltk.parse.CoreNLPDependencyParser

class nltk.parse.CoreNLPDependencyParser[source]

Bases: GenericCoreNLPParser

Dependency parser.

>>> dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
>>> parse, = dep_parser.raw_parse(
...     'The quick brown fox jumps over the lazy dog.'
... )
>>> print(parse.to_conll(4))  
The     DT      4       det
quick   JJ      4       amod
brown   JJ      4       amod
fox     NN      5       nsubj
jumps   VBZ     0       ROOT
over    IN      9       case
the     DT      9       det
lazy    JJ      9       amod
dog     NN      5       nmod
.       .       5       punct
>>> print(parse.tree())  
(jumps (fox The quick brown) (dog over the lazy) .)
>>> for governor, dep, dependent in parse.triples():
...     print(governor, dep, dependent)  
    ('jumps', 'VBZ') nsubj ('fox', 'NN')
    ('fox', 'NN') det ('The', 'DT')
    ('fox', 'NN') amod ('quick', 'JJ')
    ('fox', 'NN') amod ('brown', 'JJ')
    ('jumps', 'VBZ') nmod ('dog', 'NN')
    ('dog', 'NN') case ('over', 'IN')
    ('dog', 'NN') det ('the', 'DT')
    ('dog', 'NN') amod ('lazy', 'JJ')
    ('jumps', 'VBZ') punct ('.', '.')
>>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents(
...     [
...         'The quick brown fox jumps over the lazy dog.',
...         'The quick grey wolf jumps over the lazy fox.',
...     ]
... )
>>> print(parse_fox.to_conll(4))  
The DT      4       det
quick       JJ      4       amod
brown       JJ      4       amod
fox NN      5       nsubj
jumps       VBZ     0       ROOT
over        IN      9       case
the DT      9       det
lazy        JJ      9       amod
dog NN      5       nmod
.   .       5       punct
>>> print(parse_dog.to_conll(4))  
The DT      4       det
quick       JJ      4       amod
grey        JJ      4       amod
wolf        NN      5       nsubj
jumps       VBZ     0       ROOT
over        IN      9       case
the DT      9       det
lazy        JJ      9       amod
fox NN      5       nmod
.   .       5       punct
>>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents(
...     [
...         "I 'm a dog".split(),
...         "This is my friends ' cat ( the tabby )".split(),
...     ]
... )
>>> print(parse_dog.to_conll(4))  
I   PRP     4       nsubj
'm  VBP     4       cop
a   DT      4       det
dog NN      0       ROOT
>>> print(parse_friends.to_conll(4))  
This        DT      6       nsubj
is  VBZ     6       cop
my  PRP$    4       nmod:poss
friends     NNS     6       nmod:poss
'   POS     4       case
cat NN      0       ROOT
-LRB-       -LRB-   9       punct
the DT      9       det
tabby       NN      6       appos
-RRB-       -RRB-   9       punct
>>> parse_john, parse_mary, = dep_parser.parse_text(
...     'John loves Mary. Mary walks.'
... )
>>> print(parse_john.to_conll(4))  
John        NNP     2       nsubj
loves       VBZ     0       ROOT
Mary        NNP     2       dobj
.   .       2       punct
>>> print(parse_mary.to_conll(4))  
Mary        NNP     2       nsubj
walks       VBZ     0       ROOT
.   .       2       punct

Special cases

Non-breaking space inside of a token.

>>> len(
...     next(
...         dep_parser.raw_parse(
...             'Anhalt said children typically treat a 20-ounce soda bottle as one '
...             'serving, while it actually contains 2 1/2 servings.'
...         )
...     ).nodes
... )
21

Phone numbers.

>>> len(
...     next(
...         dep_parser.raw_parse('This is not going to crash: 01 111 555.')
...     ).nodes
... )
10
>>> print(
...     next(
...         dep_parser.raw_parse('The underscore _ should not simply disappear.')
...     ).to_conll(4)
... )  
The         DT  3   det
underscore  VBP 3   amod
_           NN  7   nsubj
should      MD  7   aux
not         RB  7   neg
simply      RB  7   advmod
disappear   VB  0   ROOT
.           .   7   punct
>>> print(
...     '\n'.join(
...         next(
...             dep_parser.raw_parse(
...                 'for all of its insights into the dream world of teen life , and its electronic expression through '
...                 'cyber culture , the film gives no quarter to anyone seeking to pull a cohesive story out of its 2 '
...                 '1/2-hour running time .'
...             )
...         ).to_conll(4).split('\n')[-8:]
...     )
... )
its PRP$    40      nmod:poss
2 1/2       CD      40      nummod
-   :       40      punct
hour        NN      31      nmod
running     VBG     42      amod
time        NN      40      dep
.   .       24      punct
parser_annotator = 'depparse'
make_tree(result)[source]
__init__(url='http://localhost:9000', encoding='utf8', tagtype=None)[source]
accuracy(gold)[source]

Score the accuracy of the tagger against the gold standard. Strip the tags from the gold standard text, retag it using the tagger, then compute the accuracy score.

Parameters

gold (list(list(tuple(str, str)))) – The list of tagged sentences to score the tagger on.

Return type

float

api_call(data, properties=None, timeout=60)[source]
confusion(gold)[source]

Return a ConfusionMatrix with the tags from gold as the reference values, with the predictions from tag_sents as the predicted values.

>>> from nltk.tag import PerceptronTagger
>>> from nltk.corpus import treebank
>>> tagger = PerceptronTagger()
>>> gold_data = treebank.tagged_sents()[:10]
>>> print(tagger.confusion(gold_data))
       |        -                                                                                     |
       |        N                                                                                     |
       |        O                                               P                                     |
       |        N                       J  J        N  N  P  P  R     R           V  V  V  V  V  W    |
       |  '     E     C  C  D  E  I  J  J  J  M  N  N  N  O  R  P  R  B  R  T  V  B  B  B  B  B  D  ` |
       |  '  ,  -  .  C  D  T  X  N  J  R  S  D  N  P  S  S  P  $  B  R  P  O  B  D  G  N  P  Z  T  ` |
-------+----------------------------------------------------------------------------------------------+
    '' | <1> .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . |
     , |  .<15> .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . |
-NONE- |  .  . <.> .  .  2  .  .  .  2  .  .  .  5  1  .  .  .  .  2  .  .  .  .  .  .  .  .  .  .  . |
     . |  .  .  .<10> .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . |
    CC |  .  .  .  . <1> .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . |
    CD |  .  .  .  .  . <5> .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . |
    DT |  .  .  .  .  .  .<20> .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . |
    EX |  .  .  .  .  .  .  . <1> .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . |
    IN |  .  .  .  .  .  .  .  .<22> .  .  .  .  .  .  .  .  .  .  3  .  .  .  .  .  .  .  .  .  .  . |
    JJ |  .  .  .  .  .  .  .  .  .<16> .  .  .  .  1  .  .  .  .  1  .  .  .  .  .  .  .  .  .  .  . |
   JJR |  .  .  .  .  .  .  .  .  .  . <.> .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . |
   JJS |  .  .  .  .  .  .  .  .  .  .  . <1> .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . |
    MD |  .  .  .  .  .  .  .  .  .  .  .  . <1> .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . |
    NN |  .  .  .  .  .  .  .  .  .  .  .  .  .<28> 1  1  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . |
   NNP |  .  .  .  .  .  .  .  .  .  .  .  .  .  .<25> .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . |
   NNS |  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .<19> .  .  .  .  .  .  .  .  .  .  .  .  .  .  . |
   POS |  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . <1> .  .  .  .  .  .  .  .  .  .  .  .  .  . |
   PRP |  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . <4> .  .  .  .  .  .  .  .  .  .  .  .  . |
  PRP$ |  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . <2> .  .  .  .  .  .  .  .  .  .  .  . |
    RB |  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . <4> .  .  .  .  .  .  .  .  .  .  . |
   RBR |  .  .  .  .  .  .  .  .  .  .  1  .  .  .  .  .  .  .  .  . <1> .  .  .  .  .  .  .  .  .  . |
    RP |  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . <1> .  .  .  .  .  .  .  .  . |
    TO |  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . <5> .  .  .  .  .  .  .  . |
    VB |  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . <3> .  .  .  .  .  .  . |
   VBD |  .  .  .  .  .  .  .  .  .  .  .  .  .  1  .  .  .  .  .  .  .  .  .  . <6> .  .  .  .  .  . |
   VBG |  .  .  .  .  .  .  .  .  .  .  .  .  .  1  .  .  .  .  .  .  .  .  .  .  . <4> .  .  .  .  . |
   VBN |  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  1  . <4> .  .  .  . |
   VBP |  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . <3> .  .  . |
   VBZ |  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . <7> .  . |
   WDT |  .  .  .  .  .  .  .  .  2  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . <.> . |
    `` |  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . <1>|
-------+----------------------------------------------------------------------------------------------+
(row = reference; col = test)
Parameters

gold (list(list(tuple(str, str)))) – The list of tagged sentences to run the tagger with, also used as the reference values in the generated confusion matrix.

Return type

ConfusionMatrix

evaluate(**kwargs)[source]

@deprecated: Use accuracy(gold) instead.

evaluate_per_tag(gold, alpha=0.5, truncate=None, sort_by_count=False)[source]

Tabulate the recall, precision and f-measure for each tag from gold or from running tag on the tokenized sentences from gold.

>>> from nltk.tag import PerceptronTagger
>>> from nltk.corpus import treebank
>>> tagger = PerceptronTagger()
>>> gold_data = treebank.tagged_sents()[:10]
>>> print(tagger.evaluate_per_tag(gold_data))
   Tag | Prec.  | Recall | F-measure
-------+--------+--------+-----------
    '' | 1.0000 | 1.0000 | 1.0000
     , | 1.0000 | 1.0000 | 1.0000
-NONE- | 0.0000 | 0.0000 | 0.0000
     . | 1.0000 | 1.0000 | 1.0000
    CC | 1.0000 | 1.0000 | 1.0000
    CD | 0.7143 | 1.0000 | 0.8333
    DT | 1.0000 | 1.0000 | 1.0000
    EX | 1.0000 | 1.0000 | 1.0000
    IN | 0.9167 | 0.8800 | 0.8980
    JJ | 0.8889 | 0.8889 | 0.8889
   JJR | 0.0000 | 0.0000 | 0.0000
   JJS | 1.0000 | 1.0000 | 1.0000
    MD | 1.0000 | 1.0000 | 1.0000
    NN | 0.8000 | 0.9333 | 0.8615
   NNP | 0.8929 | 1.0000 | 0.9434
   NNS | 0.9500 | 1.0000 | 0.9744
   POS | 1.0000 | 1.0000 | 1.0000
   PRP | 1.0000 | 1.0000 | 1.0000
  PRP$ | 1.0000 | 1.0000 | 1.0000
    RB | 0.4000 | 1.0000 | 0.5714
   RBR | 1.0000 | 0.5000 | 0.6667
    RP | 1.0000 | 1.0000 | 1.0000
    TO | 1.0000 | 1.0000 | 1.0000
    VB | 1.0000 | 1.0000 | 1.0000
   VBD | 0.8571 | 0.8571 | 0.8571
   VBG | 1.0000 | 0.8000 | 0.8889
   VBN | 1.0000 | 0.8000 | 0.8889
   VBP | 1.0000 | 1.0000 | 1.0000
   VBZ | 1.0000 | 1.0000 | 1.0000
   WDT | 0.0000 | 0.0000 | 0.0000
    `` | 1.0000 | 1.0000 | 1.0000
Parameters
  • gold (list(list(tuple(str, str)))) – The list of tagged sentences to score the tagger on.

  • alpha (float) – Ratio of the cost of false negative compared to false positives, as used in the f-measure computation. Defaults to 0.5, where the costs are equal.

  • truncate (int, optional) – If specified, then only show the specified number of values. Any sorting (e.g., sort_by_count) will be performed before truncation. Defaults to None

  • sort_by_count (bool, optional) – Whether to sort the outputs on number of occurrences of that tag in the gold data, defaults to False

Returns

A tabulated recall, precision and f-measure string

Return type

str

f_measure(gold, alpha=0.5)[source]

Compute the f-measure for each tag from gold or from running tag on the tokenized sentences from gold. Then, return the dictionary with mappings from tag to f-measure. The f-measure is the harmonic mean of the precision and recall, weighted by alpha. In particular, given the precision p and recall r defined by:

  • p = true positive / (true positive + false positive)

  • r = true positive / (true positive + false negative)

The f-measure is:

  • 1/(alpha/p + (1-alpha)/r)

With alpha = 0.5, this reduces to:

  • 2pr / (p + r)

Parameters
  • gold (list(list(tuple(str, str)))) – The list of tagged sentences to score the tagger on.

  • alpha (float) – Ratio of the cost of false negative compared to false positives. Defaults to 0.5, where the costs are equal.

Returns

A mapping from tags to f-measure

Return type

Dict[str, float]

grammar()[source]
Returns

The grammar used by this parser.

parse(sent, *args, **kwargs)[source]
Returns

An iterator that generates parse trees for the sentence. When possible this list is sorted from most likely to least likely.

Parameters

sent (list(str)) – The sentence to be parsed

Return type

iter(Tree)

parse_all(sent, *args, **kwargs)[source]
Return type

list(Tree)

parse_one(sent, *args, **kwargs)[source]
Return type

Tree or None

parse_sents(sentences, *args, **kwargs)[source]

Parse multiple sentences.

Takes multiple sentences as a list where each sentence is a list of words. Each sentence will be automatically tagged with this CoreNLPParser instance’s tagger.

If whitespace exists inside a token, then the token will be treated as several tokens.

Parameters

sentences (list(list(str))) – Input sentences to parse

Return type

iter(iter(Tree))

parse_text(text, *args, **kwargs)[source]

Parse a piece of text.

The text might contain several sentences which will be split by CoreNLP.

Parameters

text (str) – text to be split.

Returns

an iterable of syntactic structures. # TODO: should it be an iterable of iterables?

precision(gold)[source]

Compute the precision for each tag from gold or from running tag on the tokenized sentences from gold. Then, return the dictionary with mappings from tag to precision. The precision is defined as:

  • p = true positive / (true positive + false positive)

Parameters

gold (list(list(tuple(str, str)))) – The list of tagged sentences to score the tagger on.

Returns

A mapping from tags to precision

Return type

Dict[str, float]

raw_parse(sentence, properties=None, *args, **kwargs)[source]

Parse a sentence.

Takes a sentence as a string; before parsing, it will be automatically tokenized and tagged by the CoreNLP Parser.

Parameters

sentence (str) – Input sentence to parse

Return type

iter(Tree)

raw_parse_sents(sentences, verbose=False, properties=None, *args, **kwargs)[source]

Parse multiple sentences.

Takes multiple sentences as a list of strings. Each sentence will be automatically tokenized and tagged.

Parameters

sentences (list(str)) – Input sentences to parse.

Return type

iter(iter(Tree))

raw_tag_sents(sentences)[source]

Tag multiple sentences.

Takes multiple sentences as a list where each sentence is a string.

Parameters

sentences (list(str)) – Input sentences to tag

Return type

list(list(tuple(str, str)))

recall(gold) Dict[str, float][source]

Compute the recall for each tag from gold or from running tag on the tokenized sentences from gold. Then, return the dictionary with mappings from tag to recall. The recall is defined as:

  • r = true positive / (true positive + false negative)

Parameters

gold (list(list(tuple(str, str)))) – The list of tagged sentences to score the tagger on.

Returns

A mapping from tags to recall

Return type

Dict[str, float]

span_tokenize(s: str) Iterator[Tuple[int, int]][source]

Identify the tokens using integer offsets (start_i, end_i), where s[start_i:end_i] is the corresponding token.

Return type

Iterator[Tuple[int, int]]

Parameters

s (str) –

span_tokenize_sents(strings: List[str]) Iterator[List[Tuple[int, int]]][source]

Apply self.span_tokenize() to each element of strings. I.e.:

return [self.span_tokenize(s) for s in strings]

Yield

List[Tuple[int, int]]

Parameters

strings (List[str]) –

Return type

Iterator[List[Tuple[int, int]]]

tag(sentence)[source]

Tag a list of tokens.

Return type

list(tuple(str, str))

>>> parser = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
>>> tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
>>> parser.tag(tokens)
[('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'),
('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'O')]
>>> parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
>>> tokens = "What is the airspeed of an unladen swallow ?".split()
>>> parser.tag(tokens)
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
tag_sents(sentences)[source]

Tag multiple sentences.

Takes multiple sentences as a list where each sentence is a list of tokens.

Parameters

sentences (list(list(str))) – Input sentences to tag

Return type

list(list(tuple(str, str)))

tokenize(text, properties=None)[source]

Tokenize a string of text.

>>> parser = CoreNLPParser(url='http://localhost:9000')
>>> text = 'Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'
>>> list(parser.tokenize(text))
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
>>> s = "The colour of the wall is blue."
>>> list(
...     parser.tokenize(
...         'The colour of the wall is blue.',
...             properties={'tokenize.options': 'americanize=true'},
...     )
... )
['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
tokenize_sents(strings: List[str]) List[List[str]][source]

Apply self.tokenize() to each element of strings. I.e.:

return [self.tokenize(s) for s in strings]

Return type

List[List[str]]

Parameters

strings (List[str]) –