"""Tree-structured data.
Including:
- Stanford Sentiment Treebank
"""
from __future__ import absolute_import
import os
from collections import OrderedDict
import networkx as nx
import numpy as np
from .. import backend as F
from ..convert import from_networkx
from .dgl_dataset import DGLBuiltinDataset
from .utils import (
    _get_dgl_url,
    load_graphs,
    load_info,
    save_graphs,
    save_info,
)
__all__ = ["SST", "SSTDataset"]
class SSTDataset(DGLBuiltinDataset):
r"""Stanford Sentiment Treebank dataset.
    Each sample is the constituency tree of a sentence. The leaf nodes
    represent words. Each word is an integer value stored in the ``x``
    feature field. Non-leaf nodes carry the special value ``PAD_WORD``
    in the ``x`` field. Each node also has a sentiment annotation with
    5 classes (very negative, negative, neutral, positive and very
    positive). The sentiment label is an integer value stored in the
    ``y`` feature field.
Official site: `<http://nlp.stanford.edu/sentiment/index.html>`_
Statistics:
- Train examples: 8,544
- Dev examples: 1,101
- Test examples: 2,210
- Number of classes for each node: 5
Parameters
----------
mode : str, optional
Should be one of ['train', 'dev', 'test', 'tiny']
Default: train
glove_embed_file : str, optional
        The path to the pretrained GloVe embedding file.
Default: None
vocab_file : str, optional
        Optional vocabulary file. If not given, the default vocabulary file is used.
Default: None
    raw_dir : str
        Raw file directory to download/store the input data.
        Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose : bool
        Whether to print out progress information. Default: False.
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
Attributes
----------
vocab : OrderedDict
Vocabulary of the dataset
num_classes : int
Number of classes for each node
    pretrained_emb : Tensor
        Pretrained GloVe embeddings with respect to the vocabulary.
vocab_size : int
The size of the vocabulary
Notes
-----
    All samples are loaded and preprocessed in memory first.
Examples
--------
>>> # get dataset
>>> train_data = SSTDataset()
>>> dev_data = SSTDataset(mode='dev')
>>> test_data = SSTDataset(mode='test')
>>> tiny_data = SSTDataset(mode='tiny')
>>>
>>> len(train_data)
8544
>>> train_data.num_classes
5
>>> glove_embed = train_data.pretrained_emb
>>> train_data.vocab_size
19536
>>> train_data[0]
Graph(num_nodes=71, num_edges=70,
ndata_schemes={'x': Scheme(shape=(), dtype=torch.int64), 'y': Scheme(shape=(), dtype=torch.int64), 'mask': Scheme(shape=(), dtype=torch.int64)}
edata_schemes={})
>>> for tree in train_data:
... input_ids = tree.ndata['x']
... labels = tree.ndata['y']
... mask = tree.ndata['mask']
... # your code here
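    >>>
    >>> # A sketch of initializing an embedding layer from the filtered GloVe
    >>> # vectors; it assumes the PyTorch backend and that the dataset was
    >>> # constructed with a valid ``glove_embed_file``:
    >>> import torch.nn as nn
    >>> emb_layer = nn.Embedding.from_pretrained(
    ...     train_data.pretrained_emb, freeze=False
    ... )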
"""
PAD_WORD = -1 # special pad word id
UNK_WORD = -1 # out-of-vocabulary word id
def __init__(
self,
mode="train",
glove_embed_file=None,
vocab_file=None,
raw_dir=None,
force_reload=False,
verbose=False,
transform=None,
):
assert mode in ["train", "dev", "test", "tiny"]
_url = _get_dgl_url("dataset/sst.zip")
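        # Pretrained embeddings are only prepared for the training split.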
self._glove_embed_file = glove_embed_file if mode == "train" else None
self.mode = mode
self._vocab_file = vocab_file
super(SSTDataset, self).__init__(
name="sst",
url=_url,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose,
transform=transform,
)
def process(self):
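        # nltk is an optional dependency, imported lazily because it is
        # only needed here, for parsing the raw treebank files.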
from nltk.corpus.reader import BracketParseCorpusReader
# load vocab file
self._vocab = OrderedDict()
vocab_file = (
self._vocab_file
if self._vocab_file is not None
else os.path.join(self.raw_path, "vocab.txt")
)
with open(vocab_file, encoding="utf-8") as vf:
for line in vf.readlines():
line = line.strip()
self._vocab[line] = len(self._vocab)
# filter glove
if self._glove_embed_file is not None and os.path.exists(
self._glove_embed_file
):
glove_emb = {}
with open(self._glove_embed_file, "r", encoding="utf-8") as pf:
for line in pf.readlines():
sp = line.split(" ")
if sp[0].lower() in self._vocab:
glove_emb[sp[0].lower()] = np.asarray(
[float(x) for x in sp[1:]]
)
files = ["{}.txt".format(self.mode)]
corpus = BracketParseCorpusReader(self.raw_path, files)
sents = corpus.parsed_sents(files[0])
# initialize with glove
pretrained_emb = []
fail_cnt = 0
        for word in self._vocab.keys():
            if self._glove_embed_file is not None and os.path.exists(
                self._glove_embed_file
            ):
                if word.lower() not in glove_emb:
                    fail_cnt += 1
                pretrained_emb.append(
                    glove_emb.get(
                        word.lower(), np.random.uniform(-0.05, 0.05, 300)
                    )
                )
self._pretrained_emb = None
if self._glove_embed_file is not None and os.path.exists(
self._glove_embed_file
):
self._pretrained_emb = F.tensor(np.stack(pretrained_emb, 0))
            print(
                "Miss rate of words in GloVe: {0:.4f}".format(
                    1.0 * fail_cnt / len(self._pretrained_emb)
                )
            )
# build trees
self._trees = []
for sent in sents:
self._trees.append(self._build_tree(sent))
def _build_tree(self, root):
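        r"""Convert an NLTK constituency tree into a :class:`dgl.DGLGraph`
        with word ids (``x``), sentiment labels (``y``) and leaf masks
        (``mask``) as node features."""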
g = nx.DiGraph()
def _rec_build(nid, node):
for child in node:
cid = g.number_of_nodes()
if isinstance(child[0], str) or isinstance(child[0], bytes):
# leaf node
word = self.vocab.get(child[0].lower(), self.UNK_WORD)
g.add_node(cid, x=word, y=int(child.label()), mask=1)
else:
g.add_node(
cid, x=SSTDataset.PAD_WORD, y=int(child.label()), mask=0
)
_rec_build(cid, child)
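                # Edges point from child to parent, so messages can be
                # propagated bottom-up (leaves to root).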
g.add_edge(cid, nid)
# add root
g.add_node(0, x=SSTDataset.PAD_WORD, y=int(root.label()), mask=0)
_rec_build(0, root)
ret = from_networkx(g, node_attrs=["x", "y", "mask"])
return ret
@property
def graph_path(self):
return os.path.join(self.save_path, self.mode + "_dgl_graph.bin")
@property
def vocab_path(self):
return os.path.join(self.save_path, "vocab.pkl")
def has_cache(self):
return os.path.exists(self.graph_path) and os.path.exists(
self.vocab_path
)
def save(self):
save_graphs(self.graph_path, self._trees)
save_info(self.vocab_path, {"vocab": self.vocab})
        if self.pretrained_emb is not None:
emb_path = os.path.join(self.save_path, "emb.pkl")
save_info(emb_path, {"embed": self.pretrained_emb})
def load(self):
emb_path = os.path.join(self.save_path, "emb.pkl")
self._trees = load_graphs(self.graph_path)[0]
self._vocab = load_info(self.vocab_path)["vocab"]
self._pretrained_emb = None
if os.path.exists(emb_path):
self._pretrained_emb = load_info(emb_path)["embed"]
@property
def vocab(self):
r"""Vocabulary
Returns
-------
OrderedDict
"""
return self._vocab
@property
def pretrained_emb(self):
r"""Pre-trained word embedding, if given."""
return self._pretrained_emb
    def __getitem__(self, idx):
r"""Get graph by index
Parameters
----------
idx : int
Returns
-------
:class:`dgl.DGLGraph`
graph structure, word id for each node, node labels and masks.
- ``ndata['x']``: word id of the node
            - ``ndata['y']``: label of the node
- ``ndata['mask']``: 1 if the node is a leaf, otherwise 0
"""
if self._transform is None:
return self._trees[idx]
else:
return self._transform(self._trees[idx])
    def __len__(self):
r"""Number of graphs in the dataset."""
return len(self._trees)
@property
def vocab_size(self):
r"""Vocabulary size."""
return len(self._vocab)
@property
def num_classes(self):
r"""Number of classes for each node."""
return 5
SST = SSTDataset  # alias kept for backward compatibility
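
# A minimal usage sketch (comments only, not executed on import). It assumes
# the PyTorch backend; ``dgl.batch`` merges a list of trees into one batched
# graph, which makes it a convenient ``collate_fn``:
#
#     import dgl
#     from torch.utils.data import DataLoader
#
#     loader = DataLoader(
#         SSTDataset(mode="tiny"), batch_size=16, collate_fn=dgl.batch
#     )
#     for batched_trees in loader:
#         word_ids = batched_trees.ndata["x"]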