NTerminal’s Natural Language Processing (NLP) Module allows for keyword analysis, context lookup, and event drill down functionality across traditional media and social data sources.
The natural language data NTerminal aggregates and analyzes includes data from traditional media sources (e.g. New York Times articles), social media (e.g. Twitter & Reddit), messenger channels, tech blogs, Github profiles and the meeting minutes and decisions of financial regulators around the world (for example, we have every decision from the Securities and Exchange Commission since 1992). Our NLP modules auto-analyze natural language data and extract named entities like companies, locations, person names, and other information. For instance, NTerminal can parse litigation releases and see the individuals or companies under investigation. It can also send information to other processors, such as our AI module, which finds patterns, simulates the behavior of market participants, and predicts future outcomes. In the example above, information extracted by our NLP module on parties involved in rulemaking cases can be used to predict whether future rule change applications will be approved or denied.
You can read more about how our NLP Module leverages sentiment analysis, named entity extraction, and machine vision by following the links below:
Field | Description |
---|---|
time | Timestamp in UTC time zone. |
author | Article, post or tweet author. |
context | Fragment of text containing the keyword. |
decision | SEC decision. |
document_stats (stats_lines, stats_pages, stats_size, stats_words) | Document statistics. |
document_url | Document URL. |
event_source | Event source: ‘SEC’, ‘EDGAR’, ‘RSS’ etc. |
event_type | Event type: ‘keyword’. |
hashtags | Hashtags retrieved from the tweet. |
keyword_category | Keyword category. |
keyword_description | Keyword description. |
keyword_label | Entity label the keyword refers to. |
keyword_subcategory | Keyword subcategory. |
links | Hyperlinks retrieved from the text. |
match | Keyword found in text; literal match. |
match_pos | Where the match occurs: ‘release text’, ‘title’, ‘document’. |
media | Image annotation and image text analysis of user media. |
named_entities (person_name_candidates, organization_name_candidates, location_name_candidates) | List of named entities extracted from the document. |
related_documents | List of related PDF documents. |
release_number | SEC Litigation release number. |
respondents | List of litigation respondents: persons and organizations. |
rule_names | List of SEC rules the document is related to. |
source_category | Source category. |
source_subcategory | Event source subcategory. |
source_url | URL where the document was published. |
time | Timestamp in UTC time zone. |
title | Document title. |
user_mentions | Twitter users mentioned in the tweet. |
{
  "time": "2018-08-16T00:00:00Z",
  "event_source": "SEC",
  "source_category": "Litigation_Releases",
  "source_url": "https://www.sec.gov/litigation/litreleases.shtml",
  "event_type": "keyword",
  "keyword_label": "Robert A. Cohen",
  "keyword_category": "person",
  "keyword_subcategory": "",
  "keyword_description": "SEC - Enforcement Division - Cyber Unit",
  "match": "Robert A. Cohen",
  "match_pos": "release text",
  "context": "The SEC's investigation has been conducted by William Max Hathaway, Colby A. Steele, Patrick McCluskey, and Carolyn M. Welshhans in the Enforcement Division's Market Abuse Unit. The case has been supervised by Joseph G. Sansone, Chief of the Market Abuse Unit, and Robert A. Cohen. The litigation is being led by Melissa Armstrong and Cheryl Crumpton.",
  "document_url": "https://www.sec.gov/litigation/litreleases/2018/lr24236.htm",
  "release_number": "LR-24236",
  "person_name_candidates": [],
  "organization_name_candidates": [],
  "location_name_candidates": [],
  "respondents": [
    "Dorothy Zarsky",
    "Lauren Zarksy"
  ],
  "related_documents": [],
  "links": ["https://www.sec.gov/litigation/litreleases/2018/lr24231.htm"]
}
{
  "time": "2018-08-07T19:25:44Z",
  "event_source": "SEC",
  "source_category": "Rulemaking",
  "source_subcategory": "CboeBZX",
  "source_url": "https://www.sec.gov/rules/sro/cboebzx.htm",
  "event_type": "keyword",
  "keyword_label": "Bitcoin",
  "keyword_category": "currency",
  "keyword_subcategory": "name",
  "keyword_description": "",
  "match": "Bitcoin",
  "match_pos": "document",
  "context": "Act of 1934 (“Act”)1 and Rule 19b-4 thereunder,2 a proposed rule change to list and trade shares of SolidX Bitcoin Shares issued by the VanEck SolidX Bitcoin Trust, under BZX Rule 14.11(e)(4), Commodity-Based Trust Shares. The proposed rule change was published for",
  "title": "Notice of Designation of a Longer Period for Commission Action on a Proposed Rule Change to List and Trade Shares of SolidX Bitcoin Shares Issued by the VanEck SolidX Bitcoin Trust",
  "document_url": "https://www.sec.gov/rules/sro/cboebzx/2018/34-83792.pdf",
  "stats_lines": 47,
  "stats_pages": 2,
  "stats_size": "83.79 KB",
  "stats_words": 433,
  "organization_name_candidates": [
    "Commission",
    "Longer Period for Commission Action",
    "Securities and Exchange Commission",
    "Cboe BZX Exchange , Inc.",
    "SolidX Bitcoin Shares",
    "VanEck SolidX Bitcoin Trust"
  ],
  "person_name_candidates": ["Eduardo A. Aleman"],
  "location_name_candidates": [],
  "rule_names": ["SR-CboeBZX-2018-040"],
  "decision": ["decision", "delay", "longer_period"],
  "related_documents": [],
  "links": []
}
{
  "type": "object",
  "title": "Natural Language Content",
  "properties": {
    "header": {
      "type": "object",
      "required": ["category", "subcategory"],
      "properties": {
        "author": {
          "type": "object",
          "properties": {
            "full_name": { "type": "string" },
            "id": { "type": "string" },
            "aliases": {
              "type": "array",
              "items": { "type": "string" }
            }
          }
        },
        "category": { "type": "string" },
        "subcategory": { "type": "string" },
        "title": { "type": "string" }
      }
    },
    "message_type": {
      "type": "string",
      "enum": ["document", "event", "chat message", "agent_data"]
    },
    "body": {
      "anyOf": [
        { "$ref": "#/definitions/nlp_document" },
        { "$ref": "#/definitions/nlp_event" },
        { "$ref": "#/definitions/nlp_chat_message" },
        { "$ref": "#/definitions/nlp_agent" }
      ]
    }
  },
  "required": ["header", "message_type"]
}
{
  "type": "object",
  "title": "Natural Language Document",
  "properties": {
    "canonical_keywords": {
      "type": "array",
      "items": { "type": "string" }
    },
    "stats": {
      "type": "object",
      "properties": {
        "size": { "type": "integer" },
        "pages": { "type": "integer" },
        "words": { "type": "integer" },
        "lines": { "type": "integer" }
      }
    },
    "tags": {
      "type": "array",
      "items": { "type": "string" }
    },
    "named_entities": {
      "type": "object",
      "properties": {
        "persons": {
          "type": "array",
          "items": { "type": "string" }
        },
        "organizations": {
          "type": "array",
          "items": { "type": "string" }
        },
        "locations": {
          "type": "array",
          "items": { "type": "string" }
        }
      }
    },
    "extracted": {
      "type": "object",
      "properties": {
        "links": {
          "type": "array",
          "items": {
            "type": "string",
            "format": "uri"
          }
        },
        "hashtags": {
          "type": "array",
          "items": { "type": "string" }
        },
        "user_mentions": {
          "type": "array",
          "items": { "type": "string" }
        },
        "image_annotations": {
          "type": "array",
          "items": { "type": "string" }
        },
        "document_timestamp": {
          "type": "string",
          "format": "date-time"
        }
      }
    },
    "sentiment": {
      "type": "object",
      "properties": {
        "google": {
          "type": "object",
          "properties": {
            "score": {
              "type": "number",
              "format": "float"
            },
            "magnitude": {
              "type": "number",
              "format": "float"
            }
          }
        },
        "ibm": {
          "type": "object",
          "properties": {
            "score": {
              "type": "number",
              "format": "float"
            }
          }
        }
      }
    },
    "content": { "type": "string" }
  }
}
{
  "type": "object",
  "title": "Natural Language Event",
  "properties": {
    "trigger": {
      "type": "string",
      "enum": ["keyword", "address", "chat"]
    },
    "keyword": {
      "type": "object",
      "properties": {
        "value": { "type": "string" },
        "canonical": { "type": "string" },
        "symbol": { "type": "string" },
        "location": {
          "type": "string",
          "enum": ["document", "external", "tag", "title", "image"]
        },
        "category": { "type": "string" },
        "subcategory": { "type": "string" }
      }
    },
    "context": { "type": "string" },
    "sentiment": {
      "type": "object",
      "properties": {
        "google": {
          "type": "object",
          "properties": {
            "phrase": {
              "type": "object",
              "properties": {
                "score": { "type": "number", "format": "float" },
                "magnitude": { "type": "number", "format": "float" }
              }
            },
            "sentence": {
              "type": "object",
              "properties": {
                "score": { "type": "number", "format": "float" },
                "magnitude": { "type": "number", "format": "float" }
              }
            }
          }
        },
        "ibm": {
          "type": "object",
          "properties": {
            "score": { "type": "number", "format": "float" }
          }
        }
      }
    }
  },
  "required": ["trigger", "context"]
}
{
  "type": "object",
  "title": "NLP Chat Message",
  "properties": {
    "id": {
      "type": "string",
      "description": "Message ID"
    },
    "date": {
      "type": "string",
      "description": "Message timestamp, ISO date in UTC"
    },
    "source": {
      "type": "string",
      "description": "Predefined string, the same for all messages",
      "default": "Telegram"
    },
    "category": {
      "type": "string",
      "description": "Channel title; we should be able to pass it as parameter from configs"
    },
    "channel_id": {
      "type": "string",
      "description": "Channel id"
    },
    "author": {
      "type": "string",
      "description": "Message sender id"
    },
    "reciever": {
      "type": "string",
      "description": "Message receiver id"
    },
    "content": {
      "type": "string",
      "description": "Message text"
    },
    "related_documents": {
      "type": "array",
      "description": "messageMediaDocument",
      "items": {
        "type": "object",
        "properties": {
          "content": {
            "type": "string",
            "description": "Document text"
          },
          "date": {
            "type": "string",
            "description": "Document _creation_ date, if available; ISO date in UTC"
          },
          "size": {
            "type": "integer",
            "description": "File size in bytes"
          },
          "file_name": {
            "type": "string"
          }
        }
      }
    },
    "media": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "content": {
            "type": "string",
            "description": "messageMediaPhoto binary content or messageMediaVideo.thumb"
          },
          "description": {
            "type": "string",
            "description": "Media caption"
          },
          "date": {
            "type": "string",
            "description": "Media _creation_ date, if available; ISO date in UTC"
          },
          "type": {
            "type": "string",
            "description": "Media type: [image|video]"
          },
          "size": {
            "type": "integer",
            "description": "File size in bytes"
          }
        }
      }
    }
  },
  "required": [
    "id",
    "date",
    "source",
    "category",
    "author",
    "content"
  ]
}