Upload folder using huggingface_hub
Browse files- README.md +46 -0
- build/lib/headline_gen/Control.py +434 -0
- build/lib/headline_gen/__init__.py +0 -0
- dist/headline-gen-2.3.tar.gz +3 -0
- dist/headline-gen-2.4.tar.gz +3 -0
- dist/headline-gen-2.5.tar.gz +3 -0
- dist/headline-gen-2.6.tar.gz +3 -0
- dist/headline_gen-2.3-py3-none-any.whl +0 -0
- dist/headline_gen-2.4-py3-none-any.whl +0 -0
- dist/headline_gen-2.5-py3-none-any.whl +0 -0
- dist/headline_gen-2.6-py3-none-any.whl +0 -0
- headline_gen.egg-info/PKG-INFO +55 -0
- headline_gen.egg-info/SOURCES.txt +9 -0
- headline_gen.egg-info/dependency_links.txt +1 -0
- headline_gen.egg-info/requires.txt +10 -0
- headline_gen.egg-info/top_level.txt +1 -0
- headline_gen/Control.py +434 -0
- headline_gen/__init__.py +0 -0
- setup.py +24 -0
README.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Headline Generation Package
|
| 3 |
+
|
| 4 |
+
This is a Python package for generating headlines from articles.
|
| 5 |
+
|
| 6 |
+
## Installation
|
| 7 |
+
|
| 8 |
+
You can install the package using pip:
|
| 9 |
+
|
| 10 |
+
```bash
|
| 11 |
+
pip install headline-gen
|
| 12 |
+
```
|
| 13 |
+
|
| 14 |
+
## Usage
|
| 15 |
+
|
| 16 |
+
```python
|
| 17 |
+
from headline_gen.Control import ServerCntrl, Generate
|
| 18 |
+
|
| 19 |
+
# Run this once to start the server
|
| 20 |
+
Server = ServerCntrl("Start")
|
| 21 |
+
|
| 22 |
+
# Generate headline from article text
|
| 23 |
+
headline = Generate("Your article text goes here...", Server)
|
| 24 |
+
print(headline)
|
| 25 |
+
|
| 26 |
+
# Stop the server when done
|
| 27 |
+
ServerCntrl("Stop", Server)
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
## Description
|
| 31 |
+
|
| 32 |
+
This package provides functionality to generate headlines from article text using natural language processing techniques.
|
| 33 |
+
|
| 34 |
+
## Usage Instructions
|
| 35 |
+
|
| 36 |
+
1. Import the `ServerCntrl` and `Generate` functions from the `Control` module.
|
| 37 |
+
2. Start the server using `ServerCntrl("Start")`. This only needs to be done once.
|
| 38 |
+
3. Generate headlines using the `Generate` function, passing the article text and the server object returned by `ServerCntrl("Start")` as arguments.
|
| 39 |
+
4. Stop the server when done using `ServerCntrl("Stop", Server)`.
|
| 40 |
+
|
| 41 |
+
## New Release Features (v2.6) and Bug Fixes
|
| 42 |
+
|
| 43 |
+
1. Fixed a corner case issue causing a ZeroDivisionError when processing irregular parameters for phrase extraction. The package now gracefully handles such scenarios without disrupting functionality.
|
| 44 |
+
2. Renamed the function `ServerInit` to `ServerCntrl` for improved clarity and consistency within the codebase.
|
| 45 |
+
3. Additionally, streamlined the dependency management by directly including `en_core_web_sm` in the downloader module.
|
| 46 |
+
4. Made the output more comprehensive.
|
build/lib/headline_gen/Control.py
ADDED
|
@@ -0,0 +1,434 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import zipfile
|
| 3 |
+
import os
|
| 4 |
+
import nltk
|
| 5 |
+
|
| 6 |
+
from nltk.parse.corenlp import CoreNLPServer
|
| 7 |
+
|
| 8 |
+
from spacy_download import load_spacy
|
| 9 |
+
import textacy
|
| 10 |
+
from textacy import *
|
| 11 |
+
import string
|
| 12 |
+
|
| 13 |
+
import re
|
| 14 |
+
import numpy as np
|
| 15 |
+
from nltk.tokenize import sent_tokenize
|
| 16 |
+
from nltk.corpus import stopwords
|
| 17 |
+
from gensim.models import Word2Vec
|
| 18 |
+
from scipy.spatial import distance
|
| 19 |
+
import networkx as nx
|
| 20 |
+
|
| 21 |
+
#import string
|
| 22 |
+
from nltk.parse.corenlp import CoreNLPParser
|
| 23 |
+
from nltk.tree.tree import Tree
|
| 24 |
+
|
| 25 |
+
import torch
|
| 26 |
+
import numpy as np
|
| 27 |
+
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
#import re
|
| 31 |
+
#import os
|
| 32 |
+
|
| 33 |
+
# Downloader
|
| 34 |
+
|
| 35 |
+
def Downloader():
    """Fetch the NLP resources the package needs, unless CoreNLP is already unpacked.

    Nothing is done when ``Parser/stanford-corenlp-4.5.6`` exists and is
    non-empty.  Otherwise the spaCy ``en_core_web_sm`` model is loaded through
    ``spacy_download.load_spacy``, the NLTK ``punkt`` and ``stopwords`` data
    are downloaded, and the Stanford CoreNLP 4.5.6 archive is downloaded into
    ``./Parser/`` and extracted there.

    Progress and failure are reported with ``print``; the function returns
    ``None`` in every case.
    """
    directory_path = "Parser/stanford-corenlp-4.5.6"
    if os.path.exists(directory_path) and os.listdir(directory_path):
        return

    # Called for its side effect only; the returned pipeline is not needed here.
    load_spacy("en_core_web_sm")
    nltk.download('punkt')
    nltk.download('stopwords')

    url = "https://nlp.stanford.edu/software/stanford-corenlp-4.5.6.zip"
    filename = "stanford-corenlp-4.5.6.zip"
    directory = "./Parser/"
    archive_path = os.path.join(directory, filename)

    os.makedirs(directory, exist_ok=True)

    # Stream the archive to disk chunk by chunk instead of holding the whole
    # response body in memory.
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            print("Failed to download file.")
            return
        with open(archive_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    print("Download successful.")

    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(directory)
    print("Extraction successful.")
|
| 62 |
+
|
| 63 |
+
def ServerCntrl(Mode, Server = None):
    """Start or stop the Stanford CoreNLP server used for parsing.

    Args:
        Mode: ``"Start"`` to launch a new server, ``"Stop"`` to stop *Server*.
            Any other value prints ``"Un defined Operation"``.
        Server: the server object returned by an earlier ``"Start"`` call;
            only used when ``Mode == "Stop"``.

    Returns:
        The started ``CoreNLPServer`` for ``"Start"``, otherwise ``None``.
        ``None`` is also returned when the parser files are missing and could
        not be installed.
    """
    Path = 'Parser/'
    os.environ['CLASSPATH'] = os.path.join(Path, 'stanford-corenlp-4.5.6')

    directory_path = "Parser/stanford-corenlp-4.5.6"

    def _parser_installed():
        # True when the CoreNLP directory exists and contains at least one entry.
        return os.path.exists(directory_path) and bool(os.listdir(directory_path))

    if not _parser_installed():
        print("Parser Files Not Found")
        print("Attempting to Install Parser Files (This may take a Min or Two!!)")
        Downloader()
        if Mode != "Start":
            return None
        # Re-check once instead of recursing: if the download failed, a
        # recursive retry would never terminate.
        if not _parser_installed():
            print("Parser Files Could Not Be Installed")
            return None

    if Mode == "Start":
        server = CoreNLPServer()
        server.start()
        return server
    if Mode == "Stop":
        if Server is None:
            print("No Server Object Provided")
        else:
            Server.stop()
    else:
        print("Un defined Operation")
    return None
|
| 87 |
+
|
| 88 |
+
# Key Phrase Extraction
|
| 89 |
+
|
| 90 |
+
def remove_punctuation(text):
    """Return *text* with every character of ``string.punctuation`` removed."""
    return text.translate(str.maketrans('', '', string.punctuation))
|
| 93 |
+
|
| 94 |
+
def KeyPhraseSGRank(Article):
    """Extract key phrases from *Article* with textacy's SGRank.

    Punctuation is stripped from the article first, then SGRank is run over
    1- to 3-grams with ``topn=1.0``.  The phrases are printed and returned in
    the order SGRank yields them.

    Args:
        Article: the article text.

    Returns:
        A non-empty list of key-phrase strings.

    Raises:
        SystemExit: when SGRank yields no phrase at all (after printing a
            message), matching the previous ``exit()`` behaviour.
    """
    en = textacy.load_spacy_lang("en_core_web_sm")

    Article = remove_punctuation(Article)

    doc = textacy.make_spacy_doc(Article, lang=en)

    TopPhrases = [
        phrase
        for phrase, _ in textacy.extract.keyterms.sgrank(doc, ngrams=(1, 3), topn=1.0)
    ]
    if TopPhrases:
        print("...Key Phrases Found...")
        print(TopPhrases)
        return TopPhrases

    print("No Specific Key Phrases Found, Terminating the Execution...")
    # Raise SystemExit directly: the ``exit()`` helper is injected by the
    # ``site`` module for interactive use and is not guaranteed to exist.
    raise SystemExit
|
| 109 |
+
|
| 110 |
+
# Lead Sentence Extraction
|
| 111 |
+
|
| 112 |
+
class LeadSentencesOOPS:
    """Pick the most central ("lead") sentences of a text.

    Sentences are embedded with a Word2Vec model trained on the text itself,
    compared pairwise by cosine similarity, and ranked with PageRank over the
    resulting similarity graph.
    """

    def __init__(self, df):
        """Store the text *df* and split it into sentences with NLTK."""
        self.df = df
        self.sentences = sent_tokenize(self.df)

    def pre_process(self):
        """Return one token list per sentence.

        Each sentence is lower-cased, stripped of non-word/non-space
        characters, split on single spaces, and filtered of English stop words.
        """
        sentences_clean = [re.sub(r'[^\w\s]', '', sentence.lower()) for sentence in self.sentences]
        stop_words = set(stopwords.words('english'))
        sentence_tokens = [
            [word for word in sentence.split(' ') if word not in stop_words]
            for sentence in sentences_clean
        ]
        return sentence_tokens

    def count_paragraphs(self):
        """Return ``(paragraphs, count)`` where paragraphs are split on blank lines."""
        paragraphs = re.split(r'\n\s*\n', self.df)
        return (paragraphs, len(paragraphs))

    def word2vec(self):
        """Return one zero-padded embedding matrix per sentence.

        A Word2Vec model (``vector_size=1``, ``epochs=1500``) is trained on the
        pre-processed sentences; every sentence is padded with zero rows up to
        the length of the longest sentence.
        """
        sentence_tokens = self.pre_process()
        w2v = Word2Vec(sentence_tokens, vector_size=1, min_count=1, epochs=1500)
        sentence_embeddings = []
        max_len = max(len(tokens) for tokens in sentence_tokens)
        for words in sentence_tokens:
            # NOTE(review): a sentence made only of stop words yields an empty
            # list here; the 2-D pad spec below presumably fails on it - confirm.
            embedding = [w2v.wv[word] for word in words]
            padding_length = max_len - len(embedding)
            padded_embedding = np.pad(embedding, [(0, padding_length), (0, 0)], mode='constant')
            sentence_embeddings.append(padded_embedding)
        return sentence_embeddings

    def similarity_matrix(self):
        """Return the square matrix of pairwise cosine similarities between sentences."""
        sentence_embeddings = self.word2vec()
        count = len(sentence_embeddings)
        similarity_matrix = np.zeros([count, count])
        for i, row_embedding in enumerate(sentence_embeddings):
            for j, column_embedding in enumerate(sentence_embeddings):
                similarity_matrix[i][j] = 1 - distance.cosine(row_embedding.ravel(), column_embedding.ravel())
        return similarity_matrix

    def num_of_leadingsentences(self):
        """Return how many lead sentences to extract, based on the sentence count."""
        num_sentences = len(self.sentences)
        if num_sentences < 5:
            top = 1
        elif num_sentences < 10:
            top = 2
        elif num_sentences < 25:
            top = 4
        elif num_sentences < 50:
            top = 9
        elif num_sentences < 100:
            top = 18
        elif num_sentences < 200:
            top = 25
        else:
            # Covers 200 and above; the former ``>= 201`` test left ``top``
            # unassigned for exactly 200 sentences.
            top = 40
        return top

    def text_rank(self, num_sentences_to_extract):
        """Return the top-ranked sentences, in their original order.

        Args:
            num_sentences_to_extract: maximum number of sentences to keep.

        Returns:
            A list of sentences; empty when the text contains no sentence.
        """
        if not self.sentences:
            # Nothing to rank (e.g. an empty paragraph); training Word2Vec on
            # an empty corpus would fail.
            return []
        similarity_matrixs = self.similarity_matrix()
        nx_graph = nx.from_numpy_array(similarity_matrixs)
        scores = nx.pagerank(nx_graph)
        top_sentence = {sentence: scores[index] for index, sentence in enumerate(self.sentences)}
        top = dict(sorted(top_sentence.items(), key=lambda x: x[1], reverse=True)[:num_sentences_to_extract])
        return [sent for sent in self.sentences if sent in top]

    def leading_sentences(self):
        """Return the lead sentences of the article.

        With at most three paragraphs, the number of sentences is chosen by
        ``num_of_leadingsentences`` and they are ranked over the whole article.
        Otherwise one sentence is taken from every paragraph plus one from the
        whole article.
        """
        paragraphs, paragraph_count = self.count_paragraphs()
        leading_sentences = []
        if paragraph_count <= 3:
            # Few paragraphs: rank the article text itself (not the repr of
            # the paragraph list, which would inject brackets and quotes).
            num_sentences_to_extract = self.num_of_leadingsentences()
            leading_sentences.extend(self.text_rank(num_sentences_to_extract))
        else:
            num_sentences_to_extract = 1
            # One lead sentence from each paragraph ...
            for para in paragraphs:
                LSG = LeadSentencesOOPS(para)
                leading_sentences.extend(LSG.text_rank(num_sentences_to_extract))
            # ... and one from the entire article (previously this re-used
            # the last paragraph by mistake).
            leading_sentences.extend(self.text_rank(num_sentences_to_extract))

        return leading_sentences
|
| 202 |
+
|
| 203 |
+
# Parsing and Compression Algo
|
| 204 |
+
|
| 205 |
+
def remove_punctuation(text):
    """Strip all ``string.punctuation`` characters from *text* and return the result."""
    # Mapping a code point to None makes str.translate delete that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)
|
| 208 |
+
|
| 209 |
+
def Parsing(Sentence, Server):
    """Return the first parse of *Sentence* from the CoreNLP server at ``Server.url``."""
    parses = CoreNLPParser(url=Server.url).raw_parse(Sentence)
    return next(parses)
|
| 212 |
+
|
| 213 |
+
def find_leftmost_S(tree):
    """Return the first node labelled ``'S'`` in a pre-order walk of *tree*.

    String nodes are leaves and never match.  ``None`` is returned when no
    node is labelled ``'S'``.
    """
    pending = [tree]
    while pending:
        node = pending.pop()
        if isinstance(node, str):
            continue
        if node.label() == 'S':
            return node
        # Push children right-to-left so the leftmost child is visited next.
        pending.extend(reversed(list(node)))
    return None
|
| 223 |
+
|
| 224 |
+
def Pruning(tree, Label):
    """Return a copy of *tree* without any subtree whose label equals *Label*.

    String leaves are returned unchanged.  Children are pruned recursively and
    the surviving ones are wrapped in a new ``Tree`` carrying the same label.
    """
    if isinstance(tree, str) or not tree.height() > 0:
        return tree

    kept = []
    for child in tree:
        if not isinstance(child, str):
            # Drop non-leaf children that are degenerate or carry the label.
            if not child.height() > 0 or child.label() == Label:
                continue
        kept.append(Pruning(child, Label))
    return Tree(tree.label(), kept)
|
| 232 |
+
|
| 233 |
+
def IterativeTrimming(HeadLine, SGRankList, Threshold):
    """Shorten *HeadLine* by removing phrases until it fits *Threshold*.

    Phrases are taken from the end of *SGRankList* towards the front, one per
    step, while the headline is longer than *Threshold* characters and phrases
    remain.  A phrase preceded by a space is removed together with that space;
    a phrase at the start (or not present) has ``phrase + ' '`` removed; a
    phrase found mid-word is replaced by ``":"``.

    Returns:
        The (possibly shortened) headline string.
    """
    remaining = len(SGRankList)
    while len(HeadLine) > Threshold and remaining > 0:
        remaining -= 1
        phrase = SGRankList[remaining]
        position = HeadLine.find(phrase)
        if position <= 0:
            HeadLine = HeadLine.replace(phrase + ' ', "", 1)
        elif HeadLine[position - 1] == ' ':
            HeadLine = HeadLine.replace(' ' + phrase, "", 1)
        else:
            HeadLine = HeadLine.replace(phrase, ":", 1)
    return HeadLine
|
| 249 |
+
|
| 250 |
+
def Extract(Treex):
    """Join the leaves of *Treex* into one string, each leaf followed by a space."""
    return ''.join(leaf + ' ' for leaf in Treex.leaves())
|
| 256 |
+
|
| 257 |
+
# Compress every lead sentence into a headline candidate.
# For each sentence in LeadSents: punctuation is removed, the sentence is parsed
# through `server`, a subtree is selected via find_leftmost_S, subtrees labelled
# 'SBAR', 'DT', 'TMP' and 'CC' are pruned, the remaining leaves are joined with
# Extract, and the text is trimmed with IterativeTrimming (threshold 120) using
# TopPhrases. Returns the list of resulting strings.
# NOTE(review): indentation is not visible in this view, so the nesting of the
# `break` below (inside the else, the inner loop or the outer loop) cannot be
# confirmed here - verify against the original file before restructuring.
def CompressionAlgorithm(LeadSents, TopPhrases, server):
|
| 258 |
+
CompressedSentences = []
|
| 259 |
+
for i in LeadSents:
|
| 260 |
+
Suppy = remove_punctuation(i)
|
| 261 |
+
|
| 262 |
+
ParsedSentence = Parsing(Suppy, server)
|
| 263 |
+
|
| 264 |
+
# The loop variable `i` is re-used here and in the Labels loop further down.
for i in ParsedSentence:
|
| 265 |
+
for j in i:
|
| 266 |
+
lefts = find_leftmost_S(j)
|
| 267 |
+
if lefts is not None:
|
| 268 |
+
LeftMostS = lefts
|
| 269 |
+
else:
|
| 270 |
+
# No 'S' node found under j: fall back to the enclosing subtree i.
LeftMostS = i
|
| 271 |
+
break
|
| 272 |
+
|
| 273 |
+
Labels = [ 'SBAR', 'DT', 'TMP', 'CC']
|
| 274 |
+
for i in Labels:
|
| 275 |
+
Temp = Pruning(LeftMostS, i)
|
| 276 |
+
LeftMostS = Temp
|
| 277 |
+
|
| 278 |
+
Trex = Extract(Temp)
|
| 279 |
+
Kalix = IterativeTrimming(Trex, TopPhrases, 120)
|
| 280 |
+
|
| 281 |
+
'''PS = Parsing(Kalix, server)
|
| 282 |
+
Tk = Pruning(PS, 'SBAR')
|
| 283 |
+
|
| 284 |
+
Trex = Extract(Tk)'''
|
| 285 |
+
|
| 286 |
+
CompressedSentences.append(Kalix)
|
| 287 |
+
return CompressedSentences
|
| 288 |
+
|
| 289 |
+
# Key Phrase Matching and Ranking
|
| 290 |
+
|
| 291 |
+
def SGRMatching(HeadLine, TopPhrases):
    """Score how well *HeadLine* covers the ranked key phrases.

    Every phrase that occurs in the headline contributes
    ``(total - rank) / total``, where ``rank`` is the phrase's first index in
    *TopPhrases*.  The sum is multiplied by the number of matching phrases and
    divided by ``total``.

    Args:
        HeadLine: candidate headline text.
        TopPhrases: key phrases, best first.

    Returns:
        The score as a float; ``0.0`` when *TopPhrases* is empty (this used to
        raise ``ZeroDivisionError``).
    """
    total = len(TopPhrases)
    if total == 0:
        return 0.0

    weight_sum, matches = 0.0, 0
    for Phrase in TopPhrases:
        if Phrase in HeadLine:
            weight_sum += (total - TopPhrases.index(Phrase)) / total
            matches += 1
    return (matches * weight_sum) / total
|
| 303 |
+
|
| 304 |
+
def Ranking(CompressedSentences, KeyPhrases):
    """Map every compressed sentence to its ``SGRMatching`` score against *KeyPhrases*."""
    return {
        sentence: SGRMatching(sentence, KeyPhrases)
        for sentence in CompressedSentences
    }
|
| 309 |
+
|
| 310 |
+
# Post Processing using DistilBert
|
| 311 |
+
|
| 312 |
+
#
|
| 313 |
+
# Split text to segments of length 200, with overlap 50
|
| 314 |
+
#
|
| 315 |
+
def split_to_segments(wrds, length, overlap):
    """Cut *wrds* into consecutive windows of ``length + overlap`` items.

    Windows start every *length* items.  Each entry of the returned list is a
    dict with the window's items under ``"text"`` and its nominal bounds under
    ``"start_idx"`` / ``"end_idx"`` (the end may exceed ``len(wrds)``).
    Splitting stops at the first empty window.
    """
    window = length + overlap
    segments = []
    start = 0
    chunk = wrds[start:start + window]
    while chunk:
        segments.append({
            "text": chunk,
            "start_idx": start,
            "end_idx": start + window,
        })
        start += length
        chunk = wrds[start:start + window]
    return segments
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
#
|
| 335 |
+
# Punctuate wordpieces
|
| 336 |
+
#
|
| 337 |
+
def punctuate_wordpiece(wordpiece, label):
    """Apply the casing and trailing punctuation encoded in *label* to *wordpiece*.

    A label starting with ``'UPPER'`` upper-cases the whole piece, one starting
    with ``'Upper'`` upper-cases only its first character.  The label's last
    character is appended unless it is ``'_'`` or the piece already ends with it.
    """
    if label.startswith('UPPER'):
        cased = wordpiece.upper()
    elif label.startswith('Upper'):
        cased = wordpiece[0].upper() + wordpiece[1:]
    else:
        cased = wordpiece

    mark = label[-1]
    if mark == '_' or mark == cased[-1]:
        return cased
    return cased + mark
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
#
|
| 348 |
+
# Punctuate text segments (200 words)
|
| 349 |
+
#
|
| 350 |
+
def punctuate_segment(wordpieces, word_ids, labels, start_word):
    """Rebuild punctuated text from the word pieces of one segment.

    Pieces whose word id is ``None`` or below *start_word* are skipped.  A
    leading ``'##'`` is removed from a piece before it is formatted with
    ``punctuate_wordpiece``.  A space is inserted whenever the word id changes,
    unless the text built so far is empty or ends with ``'-'``.
    """
    assembled = ''
    for position, piece in enumerate(wordpieces):
        current_id = word_ids[position]
        if current_id is None or current_id < start_word:
            continue
        bare = piece[2:] if piece.startswith('##') else piece
        formatted = punctuate_wordpiece(bare, labels[position])
        starts_new_word = position > 0 and current_id != word_ids[position - 1]
        if starts_new_word and assembled and assembled[-1] != '-':
            assembled += ' '
        assembled += formatted
    return assembled
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
#
|
| 366 |
+
# Tokenize, predict, punctuate text segments (200 words)
|
| 367 |
+
#
|
| 368 |
+
def process_segment(words, tokenizer, model, start_word, encoder_max_length):
    """Tokenize one segment, predict a label per word piece, and punctuate it.

    Args:
        words: segment dict; its ``'text'`` entry (a list of words) is tokenized.
        tokenizer: tokenizer called with ``is_split_into_words=True``.
        model: token-classification model; ``model.config.id2label`` maps
            predicted ids to label strings.
        start_word: word index below which pieces are dropped from the output.
        encoder_max_length: length the input is padded to.

    Returns:
        The punctuated text produced by ``punctuate_segment``.
    """
    encoded = tokenizer(
        words['text'],
        padding="max_length",
        # truncation=True,
        max_length=encoder_max_length,
        is_split_into_words=True,
        return_tensors='pt',
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        raw_logits = model(**encoded).logits
    predicted_ids = np.argmax(raw_logits.cpu(), axis=-1)

    pieces = encoded.tokens()
    piece_word_ids = encoded.word_ids()
    label_names = model.config.id2label
    # Only the first row of predictions is used.
    first_row_labels = [label_names[p.item()] for p in predicted_ids[0]]

    return punctuate_segment(pieces, piece_word_ids, first_row_labels, start_word)
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
#
|
| 390 |
+
# Punctuate text of any length
|
| 391 |
+
#
|
| 392 |
+
def punctuate(text, tokenizer, model, encoder_max_length):
    """Restore punctuation and casing for text of any length.

    The text is lower-cased, newlines become spaces, and the words are split
    into windows of 150 words plus 50 overlapping words.  Each window is
    processed with ``process_segment``; every window after the first skips its
    first 50 words, which the previous window already emitted.

    Returns:
        The processed windows concatenated, each followed by a space.
    """
    words = text.lower().replace('\n', ' ').split(' ')

    overlap = 50
    segments = split_to_segments(words, 150, 50)

    pieces = []
    for index, segment in enumerate(segments):
        start_word = overlap if index else 0
        corrected = process_segment(segment, tokenizer, model, start_word, encoder_max_length)
        pieces.append(corrected + ' ')
    return "".join(pieces)
|
| 407 |
+
|
| 408 |
+
def PostProcess(Sentence):
    """Punctuate *Sentence* with the ``venkatchoudharyala/Punctuate`` DistilBERT checkpoint.

    The tokenizer and the model are loaded on every call; the encoder length
    is fixed at 256.
    """
    checkpoint = "venkatchoudharyala/Punctuate"
    tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
    model = DistilBertForTokenClassification.from_pretrained(checkpoint)
    return punctuate(Sentence, tokenizer, model, 256)
|
| 414 |
+
|
| 415 |
+
def Generate(Article, Server):
    """Generate a headline for *Article*.

    Parenthesised text is removed, key phrases and lead sentences are
    extracted, the lead sentences are compressed through the CoreNLP *Server*,
    and the compressed sentences are scored against the key phrases.  The
    best-scoring sentence is punctuated and returned.  Intermediate results
    are printed.

    Args:
        Article: the article text.
        Server: the running CoreNLP server object.

    Returns:
        The punctuated headline string.
    """
    article_text = re.sub(r'\([^)]*\)', '', Article)

    KeyPhrases = KeyPhraseSGRank(article_text)

    lead_sentences = LeadSentencesOOPS(article_text).leading_sentences()
    print("...Leading Sentences Found...")
    print(lead_sentences)

    candidates = CompressionAlgorithm(lead_sentences, KeyPhrases, Server)
    scores = Ranking(candidates, KeyPhrases)

    best = max(scores, key=scores.get)
    print("...Scores of Sentences...")
    print(scores)
    return PostProcess(best)
|
build/lib/headline_gen/__init__.py
ADDED
|
File without changes
|
dist/headline-gen-2.3.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5a5397807c25ff55f93a51b2734174a1add5524a6f3f6cd8a192fcd9c94a3df
|
| 3 |
+
size 6278
|
dist/headline-gen-2.4.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a755616c0f279d974304a1f1ea6bafc1f8170e06ad29a0f64a491ee1e1cab752
|
| 3 |
+
size 6286
|
dist/headline-gen-2.5.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b2bc4205713c78e56c31f6d0dce5db2e8a0074c37f2cfcf07f4da7b32b85ed5
|
| 3 |
+
size 6225
|
dist/headline-gen-2.6.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7cf39696ea27452e3b5efab02eaa8031cfec599cb8271990b8fe37c8ab1015a7
|
| 3 |
+
size 6249
|
dist/headline_gen-2.3-py3-none-any.whl
ADDED
|
Binary file (6.37 kB). View file
|
|
|
dist/headline_gen-2.4-py3-none-any.whl
ADDED
|
Binary file (6.37 kB). View file
|
|
|
dist/headline_gen-2.5-py3-none-any.whl
ADDED
|
Binary file (6.33 kB). View file
|
|
|
dist/headline_gen-2.6-py3-none-any.whl
ADDED
|
Binary file (6.33 kB). View file
|
|
|
headline_gen.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.1
|
| 2 |
+
Name: headline-gen
|
| 3 |
+
Version: 2.6
|
| 4 |
+
Summary: Provides functionality to generate headlines from articles using natural language processing techniques.
|
| 5 |
+
Author: venkatchoudharyala
|
| 6 |
+
Author-email: venkatchoudhary.ala@gmail.com
|
| 7 |
+
Requires-Python: >=3.6
|
| 8 |
+
Description-Content-Type: text/markdown
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Headline Generation Package
|
| 12 |
+
|
| 13 |
+
This is a Python package for generating headlines from Articles.
|
| 14 |
+
|
| 15 |
+
## Installation
|
| 16 |
+
|
| 17 |
+
You can install the package using pip:
|
| 18 |
+
|
| 19 |
+
```bash
|
| 20 |
+
pip install headline-gen
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
## Usage
|
| 24 |
+
|
| 25 |
+
```python
|
| 26 |
+
from headline_gen.Control import ServerCntrl, Generate
|
| 27 |
+
|
| 28 |
+
# Run this once to start the server
|
| 29 |
+
Server = ServerCntrl("Start")
|
| 30 |
+
|
| 31 |
+
# Generate headline from article text
|
| 32 |
+
headline = Generate("Your article text goes here...", Server)
|
| 33 |
+
print(headline)
|
| 34 |
+
|
| 35 |
+
# Stop the server when done
|
| 36 |
+
ServerCntrl("Stop", Server)
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
## Description
|
| 40 |
+
|
| 41 |
+
This package provides functionality to generate headlines from article text using natural language processing techniques.
|
| 42 |
+
|
| 43 |
+
## Usage Instructions
|
| 44 |
+
|
| 45 |
+
1. Import the `ServerCntrl` and `Generate` functions from the `Control` module.
|
| 46 |
+
2. Start the server using `ServerCntrl("Start")`. This only needs to be done once.
|
| 47 |
+
3. Generate headlines using the `Generate` function, passing the article text as an argument.
|
| 48 |
+
4. Stop the server when done using `ServerCntrl("Stop", Server)`.
|
| 49 |
+
|
| 50 |
+
## New Release Features (v2.6) and Bug Fixes
|
| 51 |
+
|
| 52 |
+
1. Fixed a corner case issue causing a ZeroDivisionError when processing irregular parameters for phrase extraction. The package now gracefully handles such scenarios without disrupting functionality.
|
| 53 |
+
2. Renamed the function `ServerInit` to `ServerCntrl` for improved clarity and consistency within the codebase.
|
| 54 |
+
3. Additionally, streamlined the dependency management by directly including `en_core_web_sm` in the downloader module.
|
| 55 |
+
4. Output made more Comprehensive.
|
headline_gen.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
setup.py
|
| 3 |
+
headline_gen/Control.py
|
| 4 |
+
headline_gen/__init__.py
|
| 5 |
+
headline_gen.egg-info/PKG-INFO
|
| 6 |
+
headline_gen.egg-info/SOURCES.txt
|
| 7 |
+
headline_gen.egg-info/dependency_links.txt
|
| 8 |
+
headline_gen.egg-info/requires.txt
|
| 9 |
+
headline_gen.egg-info/top_level.txt
|
headline_gen.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
headline_gen.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
requests
|
| 2 |
+
nltk
|
| 3 |
+
numpy
|
| 4 |
+
scipy==1.12.0
|
| 5 |
+
gensim
|
| 6 |
+
networkx
|
| 7 |
+
textacy
|
| 8 |
+
transformers
|
| 9 |
+
torch
|
| 10 |
+
spacy-download
|
headline_gen.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
headline_gen
|
headline_gen/Control.py
ADDED
|
@@ -0,0 +1,434 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import zipfile
|
| 3 |
+
import os
|
| 4 |
+
import nltk
|
| 5 |
+
|
| 6 |
+
from nltk.parse.corenlp import CoreNLPServer
|
| 7 |
+
|
| 8 |
+
from spacy_download import load_spacy
|
| 9 |
+
import textacy
|
| 10 |
+
from textacy import *
|
| 11 |
+
import string
|
| 12 |
+
|
| 13 |
+
import re
|
| 14 |
+
import numpy as np
|
| 15 |
+
from nltk.tokenize import sent_tokenize
|
| 16 |
+
from nltk.corpus import stopwords
|
| 17 |
+
from gensim.models import Word2Vec
|
| 18 |
+
from scipy.spatial import distance
|
| 19 |
+
import networkx as nx
|
| 20 |
+
|
| 21 |
+
#import string
|
| 22 |
+
from nltk.parse.corenlp import CoreNLPParser
|
| 23 |
+
from nltk.tree.tree import Tree
|
| 24 |
+
|
| 25 |
+
import torch
|
| 26 |
+
import numpy as np
|
| 27 |
+
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
#import re
|
| 31 |
+
#import os
|
| 32 |
+
|
| 33 |
+
# Downloader
|
| 34 |
+
|
| 35 |
+
def Downloader():
    """Fetch the NLP resources the package needs, unless CoreNLP is already unpacked.

    Nothing is done when ``Parser/stanford-corenlp-4.5.6`` exists and is
    non-empty.  Otherwise the spaCy ``en_core_web_sm`` model is loaded through
    ``spacy_download.load_spacy``, the NLTK ``punkt`` and ``stopwords`` data
    are downloaded, and the Stanford CoreNLP 4.5.6 archive is downloaded into
    ``./Parser/`` and extracted there.

    Progress and failure are reported with ``print``; the function returns
    ``None`` in every case.
    """
    directory_path = "Parser/stanford-corenlp-4.5.6"
    if os.path.exists(directory_path) and os.listdir(directory_path):
        return

    # Called for its side effect only; the returned pipeline is not needed here.
    load_spacy("en_core_web_sm")
    nltk.download('punkt')
    nltk.download('stopwords')

    url = "https://nlp.stanford.edu/software/stanford-corenlp-4.5.6.zip"
    filename = "stanford-corenlp-4.5.6.zip"
    directory = "./Parser/"
    archive_path = os.path.join(directory, filename)

    os.makedirs(directory, exist_ok=True)

    # Stream the archive to disk chunk by chunk instead of holding the whole
    # response body in memory.
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            print("Failed to download file.")
            return
        with open(archive_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    print("Download successful.")

    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(directory)
    print("Extraction successful.")
|
| 62 |
+
|
| 63 |
+
def ServerCntrl(Mode, Server = None):
    """Start or stop the CoreNLP server used for parsing.

    Args:
        Mode: ``"Start"`` to launch a new ``CoreNLPServer`` or ``"Stop"`` to
            stop an existing one. Any other value prints a message.
        Server: the server object previously returned for ``"Start"``;
            required for ``"Stop"``.

    Returns:
        The started ``CoreNLPServer`` for ``"Start"``; ``None`` otherwise,
        including when the parser files could not be installed.

    Side effects: sets the ``CLASSPATH`` environment variable, and calls
    ``Downloader()`` when the parser directory is missing or empty.
    """
    Path = 'Parser/'
    os.environ['CLASSPATH'] = os.path.join(Path, 'stanford-corenlp-4.5.6')

    directory_path = "Parser/stanford-corenlp-4.5.6"

    def parser_installed():
        return os.path.exists(directory_path) and bool(os.listdir(directory_path))

    if not parser_installed():
        print("Parser Files Not Found")
        print("Attempting to Install Parser Files (This may take a Min or Two!!)")
        Downloader()
        if Mode != "Start":
            # As before: other modes only trigger the installation attempt.
            return None
        if not parser_installed():
            # The original retried by calling itself, which never terminated
            # when the download kept failing. Give up after one attempt.
            print("Parser Installation Failed")
            return None

    if Mode == "Start":
        server = CoreNLPServer()
        server.start()
        return server
    if Mode == "Stop":
        if Server is None:
            print("No Server Object Provided")
        else:
            Server.stop()
    else:
        print("Un defined Operation")
    return None
|
| 87 |
+
|
| 88 |
+
# Key Phrase Extraction
|
| 89 |
+
|
| 90 |
+
def remove_punctuation(text):
    """Return *text* with every character in ``string.punctuation`` removed."""
    # A translation table mapping a code point to None deletes that character.
    strip_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(strip_table)
|
| 93 |
+
|
| 94 |
+
def KeyPhraseSGRank(Article):
    """Extract key phrases from *Article* with textacy's SGRank.

    Punctuation is stripped first, then SGRank is run over 1- to 3-grams with
    ``topn=1.0``. The phrases are printed and returned in the order SGRank
    yields them.

    Args:
        Article: the article text.

    Returns:
        A non-empty list of key-phrase strings.

    Raises:
        SystemExit: when no key phrase is found (after printing a message).
    """
    en = textacy.load_spacy_lang("en_core_web_sm")
    doc = textacy.make_spacy_doc(remove_punctuation(Article), lang=en)

    TopPhrases = [
        phrase
        for phrase, _weight in textacy.extract.keyterms.sgrank(doc, ngrams=(1, 3), topn=1.0)
    ]
    if not TopPhrases:
        print("No Specific Key Phrases Found, Terminating the Execution...")
        # Same exception the original ``exit()`` raised, without depending on
        # the interactive helper that the ``site`` module may not install.
        raise SystemExit

    print("...Key Phrases Found...")
    print(TopPhrases)
    return TopPhrases
|
| 109 |
+
|
| 110 |
+
# Lead Sentence Extraction
|
| 111 |
+
|
| 112 |
+
class LeadSentencesOOPS:
    """Select lead sentences from a text with a TextRank-style ranking.

    Sentences are embedded with a Word2Vec model trained on the text itself,
    compared pairwise by cosine similarity, and scored with PageRank over the
    resulting similarity graph.
    """

    def __init__(self, df):
        # Despite the name, ``df`` is the raw text (a string); it is passed
        # straight to ``sent_tokenize`` and ``re.split``.
        self.df = df
        self.sentences = sent_tokenize(self.df)

    def pre_process(self):
        """Return one token list per sentence.

        Each sentence is lower-cased, stripped of non-word/non-space
        characters, split on single spaces and filtered of English stop words.
        """
        stop_words = set(stopwords.words('english'))
        sentences_clean = [re.sub(r'[^\w\s]', '', sentence.lower()) for sentence in self.sentences]
        return [
            [word for word in sentence.split(' ') if word not in stop_words]
            for sentence in sentences_clean
        ]

    def count_paragraphs(self):
        """Return ``(paragraphs, count)``, splitting the text on blank lines."""
        paragraphs = re.split(r'\n\s*\n', self.df)
        return (paragraphs, len(paragraphs))

    def word2vec(self):
        """Return one zero-padded ``(max_len, 1)`` embedding array per sentence."""
        vector_size = 1
        sentence_tokens = self.pre_process()
        w2v = Word2Vec(sentence_tokens, vector_size=vector_size, min_count=1, epochs=1500)
        max_len = max(len(tokens) for tokens in sentence_tokens)

        sentence_embeddings = []
        for words in sentence_tokens:
            # reshape keeps the array 2-D even when every word of the sentence
            # was a stop word; the original ``np.pad`` call failed on the
            # resulting 1-D empty array.
            embedding = np.asarray(
                [w2v.wv[word] for word in words], dtype=float
            ).reshape(-1, vector_size)
            padding_length = max_len - len(embedding)
            sentence_embeddings.append(
                np.pad(embedding, [(0, padding_length), (0, 0)], mode='constant')
            )
        return sentence_embeddings

    def similarity_matrix(self):
        """Return the square matrix of pairwise cosine similarities."""
        # Flatten once; the original also re-ran pre_process() only to get the
        # sentence count, which equals the number of embeddings.
        flat = [embedding.ravel() for embedding in self.word2vec()]
        size = len(flat)
        similarity_matrix = np.zeros([size, size])
        for i, row_embedding in enumerate(flat):
            for j, column_embedding in enumerate(flat):
                similarity_matrix[i][j] = 1 - distance.cosine(row_embedding, column_embedding)
        # Cosine similarity is undefined for an all-zero embedding and may come
        # back as NaN; treat such pairs as "no similarity".
        return np.nan_to_num(similarity_matrix)

    def num_of_leadingsentences(self):
        """Return how many lead sentences to extract, based on sentence count."""
        num_sentences = len(self.sentences)
        if num_sentences < 5:
            return 1
        if num_sentences < 10:
            return 2
        if num_sentences < 25:
            return 4
        if num_sentences < 50:
            return 9
        if num_sentences < 100:
            return 18
        if num_sentences < 200:
            return 25
        # The original tested ``>= 201`` here, leaving exactly 200 sentences
        # without a value (UnboundLocalError).
        return 40

    def text_rank(self, num_sentences_to_extract):
        """Return the top-ranked sentences, in their original text order.

        Returns an empty list when the text contains no sentences.
        """
        if not self.sentences:
            # e.g. an empty paragraph; there is nothing to train or rank.
            return []
        nx_graph = nx.from_numpy_array(self.similarity_matrix())
        scores = nx.pagerank(nx_graph)
        top_sentence = {sentence: scores[index] for index, sentence in enumerate(self.sentences)}
        top = dict(
            sorted(top_sentence.items(), key=lambda x: x[1], reverse=True)[:num_sentences_to_extract]
        )
        return [sent for sent in self.sentences if sent in top]

    def leading_sentences(self):
        """Return the lead sentences of the article.

        With three or fewer paragraphs, the count from
        ``num_of_leadingsentences`` is extracted from the whole text.
        Otherwise one sentence is taken from each paragraph, plus one ranked
        over the entire article.
        """
        paragraphs, num_paragraphs = self.count_paragraphs()

        if num_paragraphs <= 3:
            # Rank this text directly. The original re-tokenised
            # ``str(paragraph_list)``, which leaked list-repr brackets, quotes
            # and escaped newlines into the sentences.
            return list(self.text_rank(self.num_of_leadingsentences()))

        num_sentences_to_extract = 1
        leading_sentences = []
        # One leading sentence from each paragraph.
        for para in paragraphs:
            leading_sentences.extend(
                LeadSentencesOOPS(para).text_rank(num_sentences_to_extract)
            )
        # One leading sentence from the entire article. The original ranked
        # the last paragraph again here instead of the whole text.
        leading_sentences.extend(self.text_rank(num_sentences_to_extract))
        return leading_sentences
|
| 202 |
+
|
| 203 |
+
# Parsing and Compression Algo
|
| 204 |
+
|
| 205 |
+
def remove_punctuation(text):
    """Return *text* with every character in ``string.punctuation`` removed."""
    # A translation table mapping a code point to None deletes that character.
    strip_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(strip_table)
|
| 208 |
+
|
| 209 |
+
def Parsing(Sentence, Server):
    """Return the first parse of *Sentence* from the CoreNLP server.

    A ``CoreNLPParser`` is created against ``Server.url`` and the first item
    yielded by its ``raw_parse`` is returned.
    """
    corenlp = CoreNLPParser(url=Server.url)
    parses = corenlp.raw_parse(Sentence)
    return next(parses)
|
| 212 |
+
|
| 213 |
+
def find_leftmost_S(tree):
    """Return the first node labelled ``'S'`` in a left-to-right, depth-first walk.

    String nodes are leaves and are skipped. Returns ``None`` when neither
    the tree nor any of its descendants carries the label ``'S'``.
    """
    pending = [tree]
    while pending:
        node = pending.pop()
        if isinstance(node, str):  # Terminal node
            continue
        if node.label() == 'S':
            return node
        # Push children right-to-left so the leftmost child is visited next.
        pending.extend(reversed(list(node)))
    return None
|
| 223 |
+
|
| 224 |
+
def Pruning(tree, Label):
    """Return a copy of *tree* with every subtree labelled *Label* removed.

    String leaves are returned unchanged. For a tree node, children are kept
    when they are strings, or when they have positive height and a label
    different from *Label*; kept children are pruned recursively and wrapped
    in a new ``Tree`` with the same label.
    """
    if isinstance(tree, str):
        return tree
    if not tree.height() > 0:
        return tree

    kept = []
    for child in tree:
        if not isinstance(child, str):
            if not child.height() > 0 or child.label() == Label:
                continue
        kept.append(Pruning(child, Label))
    return Tree(tree.label(), kept)
|
| 232 |
+
|
| 233 |
+
def IterativeTrimming(HeadLine, SGRankList, Threshold):
    """Shorten *HeadLine* by removing key phrases until it fits *Threshold*.

    Phrases are consumed from the end of *SGRankList* towards the front, one
    per step, while the headline is longer than *Threshold* characters:

    * phrase found after position 0 and preceded by a space: the space and
      the phrase are removed;
    * phrase found after position 0 but not preceded by a space: the phrase
      is replaced with ``":"``;
    * otherwise: the first occurrence of the phrase followed by a space is
      removed (a no-op when there is none).

    Returns the headline once it fits or the phrase list is exhausted.
    """
    remaining = len(SGRankList)
    while len(HeadLine) > Threshold and remaining > 0:
        remaining -= 1
        ptr = SGRankList[remaining]
        position = HeadLine.find(ptr)
        if position <= 0:
            HeadLine = HeadLine.replace(ptr + ' ', "", 1)
        elif HeadLine[position - 1] == ' ':
            HeadLine = HeadLine.replace(' ' + ptr, "", 1)
        else:
            HeadLine = HeadLine.replace(ptr, ":", 1)
    return HeadLine
|
| 249 |
+
|
| 250 |
+
def Extract(Treex):
    """Return the leaves of *Treex* as one string, each followed by a space."""
    return ''.join(leaf + ' ' for leaf in Treex.leaves())
|
| 256 |
+
|
| 257 |
+
def CompressionAlgorithm(LeadSents, TopPhrases, server):
    """Compress each lead sentence into a headline candidate.

    For every sentence: strip punctuation, parse it through the CoreNLP
    server, pick a clause to keep, prune the ``SBAR``, ``DT``, ``TMP`` and
    ``CC`` subtrees from it, join the remaining leaves, and trim the result
    towards 120 characters using *TopPhrases*.

    Args:
        LeadSents: iterable of sentence strings.
        TopPhrases: ranked key phrases passed on to ``IterativeTrimming``.
        server: running CoreNLP server object passed on to ``Parsing``.

    Returns:
        A list with one compressed string per input sentence.
    """
    Labels = ['SBAR', 'DT', 'TMP', 'CC']
    CompressedSentences = []

    for sentence in LeadSents:
        ParsedSentence = Parsing(remove_punctuation(sentence), server)

        # Fall back to the whole parse so a parse without children can neither
        # raise NameError nor silently reuse the previous sentence's clause.
        LeftMostS = ParsedSentence
        for clause in ParsedSentence:
            # Only the first child of each top-level clause is inspected.
            # NOTE(review): the position of ``break`` was reconstructed from
            # source that had lost its indentation - confirm.
            for first_child in clause:
                lefts = find_leftmost_S(first_child)
                LeftMostS = lefts if lefts is not None else clause
                break

        for label in Labels:
            LeftMostS = Pruning(LeftMostS, label)

        Trex = Extract(LeftMostS)
        CompressedSentences.append(IterativeTrimming(Trex, TopPhrases, 120))

    return CompressedSentences
|
| 288 |
+
|
| 289 |
+
# Key Phrase Matching and Ranking
|
| 290 |
+
|
| 291 |
+
def SGRMatching(HeadLine, TopPhrases):
    """Score *HeadLine* by the ranked key phrases it contains.

    Each phrase found in the headline (substring match) contributes
    ``(n - rank) / n``, where ``n`` is the number of phrases and ``rank`` the
    phrase's first position in *TopPhrases*. The summed contributions are
    multiplied by the number of matches and divided by ``n``.

    Args:
        HeadLine: candidate headline string.
        TopPhrases: list of key-phrase strings, best first.

    Returns:
        The score as a float; ``0.0`` when *TopPhrases* is empty (the
        original raised ZeroDivisionError).
    """
    total = len(TopPhrases)
    if total == 0:
        return 0.0

    # First position of each phrase, matching what ``list.index`` returned,
    # but built once instead of re-scanning the list for every match.
    first_rank = {}
    for rank, phrase in enumerate(TopPhrases):
        first_rank.setdefault(phrase, rank)

    weight_sum, matches = 0.0, 0
    for Phrase in TopPhrases:
        if Phrase in HeadLine:
            weight_sum += (total - first_rank[Phrase]) / total
            matches += 1
    return (matches * weight_sum) / total
|
| 303 |
+
|
| 304 |
+
def Ranking(CompressedSentences, KeyPhrases):
    """Return a dict mapping each compressed sentence to its ``SGRMatching`` score."""
    return {
        sentence: SGRMatching(sentence, KeyPhrases)
        for sentence in CompressedSentences
    }
|
| 309 |
+
|
| 310 |
+
# Post Processing using DistilBert
|
| 311 |
+
|
| 312 |
+
#
|
| 313 |
+
# Split text to segments of length 200, with overlap 50
|
| 314 |
+
#
|
| 315 |
+
def split_to_segments(wrds, length, overlap):
    """Split a word list into consecutive, overlapping segments.

    Segment ``i`` covers ``wrds[length * i : length * (i + 1) + overlap]``,
    so each segment extends *overlap* words into the next one. Splitting
    stops at the first empty slice.

    Args:
        wrds: sequence of words.
        length: distance between segment starts; must be positive.
        overlap: extra words appended to the end of each segment.

    Returns:
        A list of dicts with keys ``"text"`` (the slice), ``"start_idx"`` and
        ``"end_idx"`` (the slice bounds; ``end_idx`` may exceed ``len(wrds)``).

    Raises:
        ValueError: if *length* is not positive. The original looped forever
            in that case because the start index never advanced.
    """
    if length <= 0:
        raise ValueError("length must be a positive integer")

    resp = []
    start = 0
    while True:
        end = start + length + overlap
        wrds_split = wrds[start:end]
        if not wrds_split:
            break
        resp.append({
            "text": wrds_split,
            "start_idx": start,
            "end_idx": end,
        })
        start += length
    return resp
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
#
|
| 335 |
+
# Punctuate wordpieces
|
| 336 |
+
#
|
| 337 |
+
def punctuate_wordpiece(wordpiece, label):
    """Apply the casing and trailing punctuation encoded in *label*.

    A label starting with ``'UPPER'`` upper-cases the whole wordpiece; one
    starting with ``'Upper'`` capitalises only its first character. The
    label's last character is appended unless it is ``'_'`` or the wordpiece
    already ends with it.

    Empty wordpieces and empty labels are handled without raising; the
    original indexed ``[0]`` / ``[-1]`` and failed with IndexError.
    """
    if label.startswith('UPPER'):
        wordpiece = wordpiece.upper()
    elif label.startswith('Upper'):
        wordpiece = wordpiece[:1].upper() + wordpiece[1:]

    trailing = label[-1:]  # '' for an empty label, so nothing is appended
    if trailing != '_' and not wordpiece.endswith(trailing):
        wordpiece += trailing
    return wordpiece
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
#
|
| 348 |
+
# Punctuate text segments (200 words)
|
| 349 |
+
#
|
| 350 |
+
def punctuate_segment(wordpieces, word_ids, labels, start_word):
    """Rebuild punctuated text from one segment's wordpieces.

    Wordpieces whose word id is ``None`` or below *start_word* are skipped.
    A leading ``'##'`` is stripped, the label at the same index is applied
    with ``punctuate_wordpiece``, and a space is inserted whenever the word
    id changes, unless the text so far is empty or ends with ``'-'``.
    """
    result = ''
    for idx, piece in enumerate(wordpieces):
        word_id = word_ids[idx]
        if word_id is None or word_id < start_word:
            continue

        if piece.startswith('##'):
            piece = piece[2:]
        wordpiece = punctuate_wordpiece(piece, labels[idx])

        starts_new_word = idx > 0 and word_id != word_ids[idx - 1]
        if starts_new_word and result and result[-1] != '-':
            result += ' '
        result += wordpiece
    return result
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
#
|
| 366 |
+
# Tokenize, predict, punctuate text segments (200 words)
|
| 367 |
+
#
|
| 368 |
+
def process_segment(words, tokenizer, model, start_word, encoder_max_length):
    """Tokenize one segment, predict a label per wordpiece, and punctuate it.

    Args:
        words: segment dict as produced by ``split_to_segments``; only its
            ``'text'`` entry (a list of words) is read.
        tokenizer: tokenizer called on the word list.
        model: token-classification model providing ``logits`` and
            ``config.id2label``.
        start_word: word index below which wordpieces are dropped.
        encoder_max_length: length the input is padded to.

    Returns:
        The punctuated text for the segment.
    """
    # Truncation is not requested, matching the original call.
    encoded = tokenizer(
        words['text'],
        padding="max_length",
        max_length=encoder_max_length,
        is_split_into_words=True,
        return_tensors='pt',
    )

    with torch.no_grad():
        logits = model(**encoded).logits
    predictions = np.argmax(logits.cpu(), axis=-1)

    id2label = model.config.id2label
    # A single word list is encoded, so only the first row is used.
    labels = [id2label[p.item()] for p in predictions[0]]

    return punctuate_segment(encoded.tokens(), encoded.word_ids(), labels, start_word)
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
#
|
| 390 |
+
# Punctuate text of any length
|
| 391 |
+
#
|
| 392 |
+
def punctuate(text, tokenizer, model, encoder_max_length):
    """Restore casing and punctuation for text of any length.

    The text is lower-cased, newlines become spaces, and the words (split on
    single spaces) are cut into segments 150 words apart with 50 words of
    overlap. Every segment after the first skips its first 50 words. Each
    segment's output is followed by a space, so the result ends with one.
    """
    overlap = 50
    words = text.lower().replace('\n', ' ').split(' ')

    pieces = []
    start_word = 0
    for segment in split_to_segments(words, 150, 50):
        corrected = process_segment(segment, tokenizer, model, start_word, encoder_max_length)
        pieces.append(corrected + ' ')
        start_word = overlap
    return "".join(pieces)
|
| 407 |
+
|
| 408 |
+
def PostProcess(Sentence):
    """Punctuate *Sentence* with the ``venkatchoudharyala/Punctuate`` checkpoint.

    The tokenizer and the token-classification model are loaded from the
    checkpoint on every call, then ``punctuate`` is run with an encoder
    length of 256.
    """
    checkpoint = "venkatchoudharyala/Punctuate"
    tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
    model = DistilBertForTokenClassification.from_pretrained(checkpoint)
    return punctuate(Sentence, tokenizer, model, 256)
|
| 414 |
+
|
| 415 |
+
def Generate(Article, Server):
    """Generate a headline for *Article* using a running CoreNLP *Server*.

    Steps: drop parenthesised spans, extract key phrases, pick lead
    sentences, compress them, score each compressed sentence against the key
    phrases, and post-process the best-scoring one. Intermediate results are
    printed along the way.

    Returns:
        The punctuated headline string.
    """
    cleaned_article = re.sub(r'\([^)]*\)', '', Article)

    KeyPhrases = KeyPhraseSGRank(cleaned_article)

    LeadingSentences = LeadSentencesOOPS(cleaned_article).leading_sentences()
    print("...Leading Sentences Found...")
    print(LeadingSentences)

    ResultDict = Ranking(
        CompressionAlgorithm(LeadingSentences, KeyPhrases, Server),
        KeyPhrases,
    )

    best_sentence = max(ResultDict, key=ResultDict.get)
    print("...Scores of Sentences...")
    print(ResultDict)
    return PostProcess(best_sentence)
|
headline_gen/__init__.py
ADDED
|
File without changes
|
setup.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from setuptools import setup, find_packages
|
| 2 |
+
|
| 3 |
+
# Read the long description up front so the file handle is closed promptly
# and the encoding does not depend on the platform default.
with open('README.md', encoding='utf-8') as readme_file:
    long_description = readme_file.read()

setup(
    name='headline-gen',
    version='2.6',
    author='venkatchoudharyala',
    author_email='venkatchoudhary.ala@gmail.com',
    description='Provides functionality to generate headlines from articles using natural language processing techniques.',
    long_description=long_description,  # Contents of README.md
    long_description_content_type='text/markdown',  # Content type of the long description
    install_requires=[
        'requests',
        'nltk',
        'numpy',
        'scipy==1.12.0',
        'gensim',
        'networkx',
        'textacy',
        'transformers',
        'torch',
        'spacy-download'
    ],
    python_requires='>=3.6',
)
|