Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -13,6 +13,21 @@ from transformers import pipeline
|
|
| 13 |
import itertools
|
| 14 |
import pandas as pd
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
OUT_HEADERS = ['E','S','G']
|
| 18 |
DF_SP500 = pd.read_csv('SP500_constituents.zip',compression=dict(method='zip'))
|
|
@@ -20,11 +35,82 @@ DF_SP500 = pd.read_csv('SP500_constituents.zip',compression=dict(method='zip'))
|
|
| 20 |
MODEL_TRANSFORMER_BASED = "distilbert-base-uncased"
|
| 21 |
MODEL_ONNX_FNAME = "ESG_classifier_batch.onnx"
|
| 22 |
MODEL_SENTIMENT_ANALYSIS = "ProsusAI/finbert"
|
| 23 |
-
#MODEL_SUMMARY_PEGASUS = "oMateos2020/pegasus-newsroom-cnn_full-adafactor-bs6"
|
| 24 |
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
def get_company_sectors(extracted_names, threshold=0.95):
|
| 30 |
'''
|
|
@@ -198,12 +284,12 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
|
|
| 198 |
prob_outs = _inference_classifier(input_batch_content)
|
| 199 |
print("[i] Classifier output shape:",prob_outs.shape)
|
| 200 |
print("[i] Running sentiment using",MODEL_SENTIMENT_ANALYSIS ,"inference...")
|
| 201 |
-
#sentiment = _inference_sentiment_model_via_api_query({"inputs": extracted['content']})
|
| 202 |
sentiment = _inference_sentiment_model_pipeline(input_batch_content )
|
| 203 |
print("[i] Running NER using custom spancat inference...")
|
| 204 |
-
#summary = _inference_summary_model_pipeline(input_batch_content )[0]['generated_text']
|
| 205 |
ner_labels = _inference_ner_spancat(input_batch_content ,limit_outputs=limit_companies)
|
| 206 |
-
print(
|
|
|
|
|
|
|
| 207 |
df = pd.DataFrame(prob_outs,columns =['E','S','G'])
|
| 208 |
if isurl:
|
| 209 |
df['URL'] = url_list
|
|
@@ -211,6 +297,7 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
|
|
| 211 |
df['content_id'] = range(1, len(input_batch_r)+1)
|
| 212 |
df['sent_lbl'] = [d['label'] for d in sentiment ]
|
| 213 |
df['sent_score'] = [d['score'] for d in sentiment ]
|
|
|
|
| 214 |
print("[i] Pandas output shape:",df.shape)
|
| 215 |
|
| 216 |
#[[], [('Nvidia', 'Information Technology')], [('Twitter', 'Communication Services'), ('Apple', 'Information Technology')], [], [], [], [], [], []]
|
|
|
|
| 13 |
import itertools
|
| 14 |
import pandas as pd
|
| 15 |
|
| 16 |
+
from bertopic import BERTopic
|
| 17 |
+
from huggingface_hub import hf_hub_url, cached_download
|
| 18 |
+
|
| 19 |
+
import nltk
|
| 20 |
+
nltk.download('stopwords')
|
| 21 |
+
nltk.download('wordnet')
|
| 22 |
+
nltk.download('omw-1.4')
|
| 23 |
+
from nltk.corpus import stopwords
|
| 24 |
+
from nltk.stem import WordNetLemmatizer
|
| 25 |
+
from nltk.stem import PorterStemmer
|
| 26 |
+
|
| 27 |
+
from unicodedata import normalize
|
| 28 |
+
|
| 29 |
+
import re
|
| 30 |
+
|
| 31 |
|
| 32 |
OUT_HEADERS = ['E','S','G']
|
| 33 |
DF_SP500 = pd.read_csv('SP500_constituents.zip',compression=dict(method='zip'))
|
|
|
|
| 35 |
MODEL_TRANSFORMER_BASED = "distilbert-base-uncased"
|
| 36 |
MODEL_ONNX_FNAME = "ESG_classifier_batch.onnx"
|
| 37 |
MODEL_SENTIMENT_ANALYSIS = "ProsusAI/finbert"
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
+
# BERTopic model (paraphrase-MiniLM-L3-v2 embeddings, 51 guided topics) hosted on the HF Hub.
BERTOPIC_REPO_ID = "oMateos2020/BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
BERTOPIC_FILENAME = "BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
# BUG FIX: the original passed undefined names REPO_ID / FILENAME to
# hf_hub_url(), raising NameError at import time (the reported build error).
# Use the BERTOPIC_* constants defined just above.
bertopic_model = BERTopic.load(
    cached_download(hf_hub_url(BERTOPIC_REPO_ID, BERTOPIC_FILENAME)),
    embedding_model="paraphrase-MiniLM-L3-v2",
)
|
| 43 |
+
|
| 44 |
+
def _topic_sanitize_word(text):
|
| 45 |
+
"""Función realiza una primera limpieza-normalización del texto a traves de expresiones regex"""
|
| 46 |
+
text = re.sub(r'@[\w_]+|#[\w_]+|https?://[\w_./]+', '', text) # Elimina menciones y URL, esto sería más para Tweets pero por si hay alguna mención o URL al ser criticas web
|
| 47 |
+
text = re.sub('\S*@\S*\s?', '', text) # Elimina correos electronicos
|
| 48 |
+
text = re.sub(r'\((\d+)\)', '', text) #Elimina numeros entre parentesis
|
| 49 |
+
text = re.sub(r'^\d+', '', text) #Elimina numeros sueltos
|
| 50 |
+
text = re.sub(r'\n', '', text) #Elimina saltos de linea
|
| 51 |
+
text = re.sub('\s+', ' ', text) # Elimina espacios en blanco adicionales
|
| 52 |
+
text = re.sub(r'[“”]', '', text) # Elimina caracter citas
|
| 53 |
+
text = re.sub(r'[()]', '', text) # Elimina parentesis
|
| 54 |
+
text = re.sub('\.', '', text) # Elimina punto
|
| 55 |
+
text = re.sub('\,', '', text) # Elimina coma
|
| 56 |
+
text = re.sub('’s', '', text) # Elimina posesivos
|
| 57 |
+
#text = re.sub(r'-+', '', text) # Quita guiones para unir palabras compuestas (normalizaría algunos casos, exmujer y ex-mujer, todos a exmujer)
|
| 58 |
+
text = re.sub(r'\.{3}', ' ', text) # Reemplaza puntos suspensivos
|
| 59 |
+
# Esta exp regular se ha incluido "a mano" tras ver que era necesaria para algunos ejemplos
|
| 60 |
+
text = re.sub(r"([\.\?])", r"\1 ", text) # Introduce espacio despues de punto e interrogacion
|
| 61 |
+
# -> NFD (Normalization Form Canonical Decomposition) y eliminar diacríticos
|
| 62 |
+
text = re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1",
|
| 63 |
+
normalize( "NFD", text), 0, re.I) # Eliminación de diacriticos (acentos y variantes puntuadas de caracteres por su forma simple excepto la 'ñ')
|
| 64 |
+
# -> NFC (Normalization Form Canonical Composition)
|
| 65 |
+
text = normalize( 'NFC', text)
|
| 66 |
+
|
| 67 |
+
return text.lower().strip()
|
| 68 |
+
|
| 69 |
+
def _topic_clean_text(text, lemmatize=True, stem=True):
    """Tokenise *text* and return a cleaned string ready for topic inference.

    Steps: drop English stopwords, sanitise each token via
    ``_topic_sanitize_word``, then optionally lemmatise and/or stem.

    Parameters
    ----------
    text : str
        Raw input text.
    lemmatize : bool
        Apply WordNet lemmatisation to the cleaned tokens.
    stem : bool
        Apply Porter stemming to the cleaned tokens.

    Returns
    -------
    str
        The cleaned, space-joined tokens.
    """
    # PERF: build the stopword set once. The original evaluated
    # stopwords.words('english') inside the comprehension condition, i.e.
    # once per token, and did an O(n) list scan each time; a set hoisted
    # out of the loop makes each membership test O(1).
    stopword_set = set(stopwords.words('english'))
    words = text.split()
    clean_text = [_topic_sanitize_word(word) for word in words
                  if word not in stopword_set]
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        clean_text = [lemmatizer.lemmatize(word) for word in clean_text]
    if stem:
        ps = PorterStemmer()
        clean_text = [ps.stem(word) for word in clean_text]
    return ' '.join(clean_text).strip()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
#SECTOR_LIST = list(DF_SP500.Sector.unique())
# Human-readable sector seed phrases. NOTE(review): _topic2sector assumes this
# list is index-aligned with DF_SP500.Sector.unique() — confirm the ordering.
SECTOR_LIST = [
    'Industry',
    'Health',
    'Technology',
    'Communication',
    'Consumer Staples',
    'Consumer Discretionary',
    'Utilities',
    'Financials',
    'Materials',
    'Real Estate',
    'Energy',
]

# For each sector, the five BERTopic topic ids most similar to its cleaned
# seed phrase (find_topics returns (topic_ids, similarities); keep the ids).
SECTOR_TOPICS = [
    bertopic_model.find_topics(_topic_clean_text(sector), top_n=5)[0]
    for sector in SECTOR_LIST
]
|
| 100 |
+
|
| 101 |
+
def _topic2sector(pred_topics):
    """Map predicted BERTopic topic ids to S&P-500 sector names.

    Parameters
    ----------
    pred_topics : iterable of int
        Topic ids, as produced by ``bertopic_model.transform``.

    Returns
    -------
    list[list[str]]
        For each input topic id, the sector names whose seed topics
        (``SECTOR_TOPICS``) contain that id — possibly an empty list.
    """
    # PERF: hoisted out of the loops. The original rebuilt
    # list(DF_SP500.Sector.unique()) inside the inner loop, i.e. one full
    # DataFrame unique() scan per (topic, sector) pair.
    sector_names = list(DF_SP500.Sector.unique())
    out = []
    for pred_topic in pred_topics:
        relevant_sectors = []
        # Indexing (rather than zip) preserves the original's behaviour if
        # the two lists ever disagree in length.
        for i in range(len(SECTOR_LIST)):
            if pred_topic in SECTOR_TOPICS[i]:
                relevant_sectors.append(sector_names[i])
        out.append(relevant_sectors)
    return out
|
| 110 |
+
|
| 111 |
+
def _inference_topic_match(text):
    """Return the BERTopic topic id predicted for each document in *text*."""
    cleaned_docs = [_topic_clean_text(doc) for doc in text]
    predicted_topics, _scores = bertopic_model.transform(cleaned_docs)
    return predicted_topics
|
| 114 |
|
| 115 |
def get_company_sectors(extracted_names, threshold=0.95):
|
| 116 |
'''
|
|
|
|
| 284 |
prob_outs = _inference_classifier(input_batch_content)
|
| 285 |
print("[i] Classifier output shape:",prob_outs.shape)
|
| 286 |
print("[i] Running sentiment using",MODEL_SENTIMENT_ANALYSIS ,"inference...")
|
|
|
|
| 287 |
sentiment = _inference_sentiment_model_pipeline(input_batch_content )
|
| 288 |
print("[i] Running NER using custom spancat inference...")
|
|
|
|
| 289 |
ner_labels = _inference_ner_spancat(input_batch_content ,limit_outputs=limit_companies)
|
| 290 |
+
print("[i] BERTopic...")
|
| 291 |
+
topics = _inference_topic_match(input_batch_content)
|
| 292 |
+
|
| 293 |
df = pd.DataFrame(prob_outs,columns =['E','S','G'])
|
| 294 |
if isurl:
|
| 295 |
df['URL'] = url_list
|
|
|
|
| 297 |
df['content_id'] = range(1, len(input_batch_r)+1)
|
| 298 |
df['sent_lbl'] = [d['label'] for d in sentiment ]
|
| 299 |
df['sent_score'] = [d['score'] for d in sentiment ]
|
| 300 |
+
df['sector_pred'] = pd.DataFrame(_topic2sector(topics)).iloc[:, 0]
|
| 301 |
print("[i] Pandas output shape:",df.shape)
|
| 302 |
|
| 303 |
#[[], [('Nvidia', 'Information Technology')], [('Twitter', 'Communication Services'), ('Apple', 'Information Technology')], [], [], [], [], [], []]
|