# Gazetteers
import os
from gatenlp import Document
from gatenlp.processing.gazetteer import TokenGazetteer
from gatenlp.processing.tokenizer import NLTKTokenizer
# All the example files will be created in "./tmp", so make sure it exists.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is a no-op
# when the directory is already there and has no window between the check
# and the creation in which another process could create it first.
os.makedirs("tmp", exist_ok=True)
# 1) Create a gazetteer from a Python list.
# Every entry is a (name, features) tuple; each name variant carries the
# Wikipedia URL of the person it refers to under the "url" feature.
# A fresh feature dict is built per entry so entries never share one object.
gazlist = [
    (name, {"url": url})
    for name, url in (
        ("Barack Obama", "https://en.wikipedia.org/wiki/Barack_Obama"),
        ("Obama", "https://en.wikipedia.org/wiki/Barack_Obama"),
        ("Donald Trump", "https://en.wikipedia.org/wiki/Donald_Trump"),
        ("Trump", "https://en.wikipedia.org/wiki/Donald_Trump"),
        ("George W. Bush", "https://en.wikipedia.org/wiki/George_W._Bush"),
        ("George Bush", "https://en.wikipedia.org/wiki/George_W._Bush"),
        ("Bush", "https://en.wikipedia.org/wiki/George_W._Bush"),
        ("Bill Clinton", "https://en.wikipedia.org/wiki/Bill_Clinton"),
        ("Clinton", "https://en.wikipedia.org/wiki/Bill_Clinton"),
    )
]
# Document with some text mentioning some of the names from the gazetteer list.
# The literal spans two lines, so the text contains one embedded newline.
text = """Barack Obama was the 44th president of the US and he followed George W. Bush and
was followed by Donald Trump. Before Bush, Bill Clinton was president."""
# Wrap the raw text in a gatenlp Document; the steps below operate on this object.
doc = Document(text)
# Bare expression: presumably renders the document when run as a notebook cell.
doc
# Tokenize the document; let's use an NLTK tokenizer.
from nltk.tokenize.destructive import NLTKWordTokenizer

# Wrap the NLTK word tokenizer in gatenlp's NLTKTokenizer, configured with
# out_set="" and token_type="Token" (presumably the target annotation set
# name and the annotation type of the created tokens).
tokenizer = NLTKTokenizer(
    nltk_tokenizer=NLTKWordTokenizer(),
    out_set="",
    token_type="Token",
)
# Run the tokenizer over the document and keep whatever it returns as doc.
doc = tokenizer(doc)
doc
# Tokenize the strings from our gazetteer list as well
def text2tokenstrings(text):
    """Tokenize *text* and return the strings covered by its tokens.

    A scratch Document is built from *text* and passed through the
    module-level ``tokenizer``. The text under every annotation of type
    "Token" in the document's ``annset()`` is then collected, in the order
    the annotation set yields them.

    Returns:
        A list with one string per "Token" annotation.
    """
    scratch = Document(text)
    # The tokenizer's return value is ignored; the annotations are read
    # back from the scratch document itself.
    tokenizer(scratch)
    token_anns = scratch.annset().with_type("Token")
    # Indexing the document with an annotation gives the text it covers.
    return [scratch[ann] for ann in token_anns]
# Swap each entry's name string for its list of token strings; the feature
# dict of every entry is carried over unchanged.
# NOTE(review): gazlist is rebound here, so executing this statement a second
# time would pass token lists (not strings) to text2tokenstrings.
gazlist = [
    (text2tokenstrings(name), features)
    for name, features in gazlist
]
gazlist
[(['Barack', 'Obama'], {'url': 'https://en.wikipedia.org/wiki/Barack_Obama'}),
(['Obama'], {'url': 'https://en.wikipedia.org/wiki/Barack_Obama'}),
(['Donald', 'Trump'], {'url': 'https://en.wikipedia.org/wiki/Donald_Trump'}),
(['Trump'], {'url': 'https://en.wikipedia.org/wiki/Donald_Trump'}),
(['George', 'W.', 'Bush'],
{'url': 'https://en.wikipedia.org/wiki/George_W._Bush'}),
(['George', 'Bush'], {'url': 'https://en.wikipedia.org/wiki/George_W._Bush'}),
(['Bush'], {'url': 'https://en.wikipedia.org/wiki/George_W._Bush'}),
(['Bill', 'Clinton'], {'url': 'https://en.wikipedia.org/wiki/Bill_Clinton'}),
(['Clinton'], {'url': 'https://en.wikipedia.org/wiki/Bill_Clinton'})]
# Create the gazetteer and apply it to the document.
# The entries are passed as an in-memory list (fmt="gazlist"). The tokentype /
# annset and outtype / outset arguments presumably select the "Token"
# annotations to match against and the "Lookup" annotations to create, both
# in the annotation set named "" -- confirm against the installed gatenlp version.
gazetteer = TokenGazetteer(
    gazlist,
    fmt="gazlist",
    all=True,
    skip=False,
    outset="",
    outtype="Lookup",
    annset="",
    tokentype="Token",
)
doc = gazetteer(doc)
doc