Spaces:
Running
Running
Commit ·
4673e91
1
Parent(s): bc1927c
progress more 31+
Browse files
- app.py +74 -10
- sample_file.xlsx +0 -0
app.py
CHANGED
|
@@ -3,7 +3,6 @@ import pandas as pd
|
|
| 3 |
import time
|
| 4 |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 5 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
| 6 |
-
#from transformers import MarianMTModel, MarianTokenizer
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
from pymystem3 import Mystem
|
| 9 |
import io
|
|
@@ -11,6 +10,7 @@ from rapidfuzz import fuzz
|
|
| 11 |
from tqdm.auto import tqdm
|
| 12 |
import time
|
| 13 |
import torch
|
|
|
|
| 14 |
|
| 15 |
# Initialize pymystem3 for lemmatization
|
| 16 |
mystem = Mystem()
|
|
@@ -107,6 +107,7 @@ def fuzzy_deduplicate(df, column, threshold=65):
|
|
| 107 |
|
| 108 |
|
| 109 |
def process_file(uploaded_file):
|
|
|
|
| 110 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
| 111 |
|
| 112 |
original_news_count = len(df)
|
|
@@ -162,8 +163,75 @@ def process_file(uploaded_file):
|
|
| 162 |
|
| 163 |
return df
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
def main():
|
| 166 |
-
st.title("... приступим к анализу... версия
|
| 167 |
|
| 168 |
uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
|
| 169 |
|
|
@@ -177,7 +245,7 @@ def main():
|
|
| 177 |
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
|
| 178 |
fig.suptitle("Распределение окраски по моделям")
|
| 179 |
|
| 180 |
-
models = ['
|
| 181 |
for i, model in enumerate(models):
|
| 182 |
ax = axs[i // 2, i % 2]
|
| 183 |
sentiment_counts = df[model].value_counts()
|
|
@@ -190,16 +258,12 @@ def main():
|
|
| 190 |
st.pyplot(fig)
|
| 191 |
|
| 192 |
# Offer download of results
|
| 193 |
-
output =
|
| 194 |
-
with pd.ExcelWriter(output, engine='openpyxl') as writer:
|
| 195 |
-
df.to_excel(writer, index=False)
|
| 196 |
-
output.seek(0)
|
| 197 |
st.download_button(
|
| 198 |
-
label="
|
| 199 |
data=output,
|
| 200 |
-
file_name="
|
| 201 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
| 202 |
)
|
| 203 |
-
|
| 204 |
if __name__ == "__main__":
|
| 205 |
main()
|
|
|
|
| 3 |
import time
|
| 4 |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 5 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
|
|
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
from pymystem3 import Mystem
|
| 8 |
import io
|
|
|
|
| 10 |
from tqdm.auto import tqdm
|
| 11 |
import time
|
| 12 |
import torch
|
| 13 |
+
from openpyxl import load_workbook
|
| 14 |
|
| 15 |
# Initialize pymystem3 for lemmatization
|
| 16 |
mystem = Mystem()
|
|
|
|
| 107 |
|
| 108 |
|
| 109 |
def process_file(uploaded_file):
|
| 110 |
+
|
| 111 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
| 112 |
|
| 113 |
original_news_count = len(df)
|
|
|
|
| 163 |
|
| 164 |
return df
|
| 165 |
|
| 166 |
+
def _excel_value(value):
    """Return *value* unchanged, except that missing scalars become None.

    NaN / NaT / pd.NA are turned into None so that the cell is left as the
    template has it instead of receiving a non-serialisable or "nan" value.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


def _write_frame(workbook, sheet_name, frame, startrow=0, startcol=0, header=True):
    """Write *frame* into *sheet_name* of an openpyxl workbook.

    The sheet is created when the workbook does not have it yet. ``startrow``
    and ``startcol`` are 0-based offsets with the same meaning as in
    ``DataFrame.to_excel``; openpyxl itself addresses cells 1-based, hence the
    ``+ 1`` below. The frame's index is never written.
    """
    if sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
    else:
        sheet = workbook.create_sheet(sheet_name)

    records = [list(frame.columns)] if header else []
    records.extend(frame.itertuples(index=False, name=None))

    for row_no, record in enumerate(records, start=startrow + 1):
        for col_no, value in enumerate(record, start=startcol + 1):
            sheet.cell(row=row_no, column=col_no, value=_excel_value(value))


def create_output_file(df):
    """Build the result workbook from the ``sample_file.xlsx`` template.

    The template is loaded with openpyxl and the computed tables are written
    straight into its sheets, so whatever layout the template carries around
    the written ranges is kept. Sheets missing from the template are created.

    Previously the function tried ``writer.book[name] = sheet`` (an openpyxl
    ``Workbook`` does not support item assignment) and ``writer.save()``
    (removed in pandas 2.0); both failed at runtime.

    Args:
        df: DataFrame that contains the columns 'Объект', 'Заголовок',
            'Выдержки из текста' and the model label columns 'FinBERT',
            'RoBERTa', 'FinBERT-Tone' holding 'Negative' / 'Positive' /
            other labels.

    Returns:
        io.BytesIO positioned at offset 0 and holding the .xlsx bytes.

    Raises:
        KeyError: if one of the required columns is missing from *df*.
        FileNotFoundError: if ``sample_file.xlsx`` is not in the working
            directory.
    """
    model_columns = ['FinBERT', 'RoBERTa', 'FinBERT-Tone']

    # A row counts as negative/positive when at least one model says so.
    labels = df[model_columns]
    is_negative = labels.eq('Negative').any(axis=1)
    is_positive = labels.eq('Positive').any(axis=1)

    # NOTE(review): every sheet of the template ends up in the output,
    # including ones this function never writes to — confirm that is wanted.
    workbook = load_workbook("sample_file.xlsx")

    # --- 'Сводка': per-entity counters, most negative entities first -------
    summary_rows = []
    for entity in df['Объект'].unique():
        of_entity = df['Объект'] == entity
        summary_rows.append([
            entity,
            int(of_entity.sum()),
            int((of_entity & is_negative).sum()),
            int((of_entity & is_positive).sum()),
        ])
    summary_df = pd.DataFrame(
        summary_rows,
        columns=['Объект', 'Всего новостей', 'Отрицательные', 'Положительные'],
    )
    summary_df = summary_df.sort_values('Отрицательные', ascending=False)
    _write_frame(workbook, 'Сводка', summary_df, startrow=3, startcol=4, header=False)

    # --- 'Значимые': every row with a non-neutral verdict -------------------
    text_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
    is_significant = is_negative | is_positive
    significant_df = df.loc[is_significant, text_columns].copy()
    # Negative wins when the models disagree, as in the original row loop.
    significant_df.insert(
        1,
        'Окраска',
        ['Negative' if negative else 'Positive' for negative in is_negative[is_significant]],
    )
    significant_df.columns = ['Объект', 'Окраска', 'Заголовок', 'Текст']
    _write_frame(workbook, 'Значимые', significant_df, startrow=2, startcol=2)

    # --- 'Анализ': negative rows only, tagged with a fixed risk type --------
    analysis_df = df.loc[is_negative, text_columns].copy()
    analysis_df.insert(1, 'Тип риска', 'РИСК УБЫТКА')
    analysis_df.columns = ['Объект', 'Тип риска', 'Заголовок', 'Текст']
    _write_frame(workbook, 'Анализ', analysis_df, startrow=3, startcol=4)

    # --- full data: over the template's 'Публикации' and as 'Тех.приложение'
    _write_frame(workbook, 'Публикации', df)
    _write_frame(workbook, 'Тех.приложение', df)

    output = io.BytesIO()
    workbook.save(output)
    output.seek(0)

    return output
|
| 231 |
+
|
| 232 |
+
|
| 233 |
def main():
|
| 234 |
+
st.title("... приступим к анализу... версия 31+")
|
| 235 |
|
| 236 |
uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
|
| 237 |
|
|
|
|
| 245 |
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
|
| 246 |
fig.suptitle("Распределение окраски по моделям")
|
| 247 |
|
| 248 |
+
models = ['ruBERT2','FinBERT', 'RoBERTa', 'FinBERT-Tone']
|
| 249 |
for i, model in enumerate(models):
|
| 250 |
ax = axs[i // 2, i % 2]
|
| 251 |
sentiment_counts = df[model].value_counts()
|
|
|
|
| 258 |
st.pyplot(fig)
|
| 259 |
|
| 260 |
# Offer download of results
|
| 261 |
+
output = create_output_file(df)
|
|
|
|
|
|
|
|
|
|
| 262 |
st.download_button(
|
| 263 |
+
label="Скачать результат анализа новостей",
|
| 264 |
data=output,
|
| 265 |
+
file_name="результат_анализа_новостей.xlsx",
|
| 266 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
| 267 |
)
|
|
|
|
| 268 |
if __name__ == "__main__":
|
| 269 |
main()
|
sample_file.xlsx
ADDED
|
Binary file (134 kB). View file
|
|
|