Spaces:
Running
Running
Commit ·
0b49979
1
Parent(s): 8c530ad
progress more 36+
Browse files
app.py
CHANGED
|
@@ -32,7 +32,6 @@ def create_analysis_data(df):
|
|
| 32 |
analysis_data.append([row['Объект'], 'РИСК УБЫТКА', row['Заголовок'], row['Выдержки из текста']])
|
| 33 |
return pd.DataFrame(analysis_data, columns=['Объект', 'Тип риска', 'Заголовок', 'Текст'])
|
| 34 |
|
| 35 |
-
|
| 36 |
# Function for lemmatizing Russian text
|
| 37 |
def lemmatize_text(text):
|
| 38 |
words = text.split()
|
|
@@ -116,9 +115,14 @@ def fuzzy_deduplicate(df, column, threshold=65):
|
|
| 116 |
|
| 117 |
|
| 118 |
def process_file(uploaded_file):
|
| 119 |
-
|
| 120 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
| 121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
original_news_count = len(df)
|
| 123 |
|
| 124 |
# Apply fuzzy deduplication
|
|
@@ -126,13 +130,11 @@ def process_file(uploaded_file):
|
|
| 126 |
lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
|
| 127 |
).reset_index(drop=True)
|
| 128 |
|
| 129 |
-
|
| 130 |
remaining_news_count = len(df)
|
| 131 |
duplicates_removed = original_news_count - remaining_news_count
|
| 132 |
|
| 133 |
st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
|
| 134 |
|
| 135 |
-
|
| 136 |
# Translate texts
|
| 137 |
translated_texts = []
|
| 138 |
lemmatized_texts = []
|
|
@@ -152,25 +154,24 @@ def process_file(uploaded_file):
|
|
| 152 |
progress_text.text(f"{i + 1} из {total_news} сообщений предобработано")
|
| 153 |
|
| 154 |
# Perform sentiment analysis
|
| 155 |
-
#rubert1_results = [get_rubert1_sentiment(text) for text in texts]
|
| 156 |
rubert2_results = [get_rubert2_sentiment(text) for text in texts]
|
| 157 |
finbert_results = [get_finbert_sentiment(text) for text in translated_texts]
|
| 158 |
roberta_results = [get_roberta_sentiment(text) for text in translated_texts]
|
| 159 |
finbert_tone_results = [get_finbert_tone_sentiment(text) for text in translated_texts]
|
| 160 |
|
| 161 |
-
#
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
|
| 173 |
-
return
|
| 174 |
|
| 175 |
def create_output_file(df, uploaded_file, analysis_df):
|
| 176 |
# Create a new workbook
|
|
@@ -238,7 +239,7 @@ def create_output_file(df, uploaded_file, analysis_df):
|
|
| 238 |
return output
|
| 239 |
|
| 240 |
def main():
|
| 241 |
-
st.title("... приступим к анализу... версия
|
| 242 |
|
| 243 |
uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
|
| 244 |
|
|
|
|
| 32 |
analysis_data.append([row['Объект'], 'РИСК УБЫТКА', row['Заголовок'], row['Выдержки из текста']])
|
| 33 |
return pd.DataFrame(analysis_data, columns=['Объект', 'Тип риска', 'Заголовок', 'Текст'])
|
| 34 |
|
|
|
|
| 35 |
# Function for lemmatizing Russian text
|
| 36 |
def lemmatize_text(text):
|
| 37 |
words = text.split()
|
|
|
|
| 115 |
|
| 116 |
|
| 117 |
def process_file(uploaded_file):
|
|
|
|
| 118 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
| 119 |
|
| 120 |
+
required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
|
| 121 |
+
missing_columns = [col for col in required_columns if col not in df.columns]
|
| 122 |
+
if missing_columns:
|
| 123 |
+
st.error(f"Error: The following required columns are missing from the input file: {', '.join(missing_columns)}")
|
| 124 |
+
st.stop()
|
| 125 |
+
|
| 126 |
original_news_count = len(df)
|
| 127 |
|
| 128 |
# Apply fuzzy deduplication
|
|
|
|
| 130 |
lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
|
| 131 |
).reset_index(drop=True)
|
| 132 |
|
|
|
|
| 133 |
remaining_news_count = len(df)
|
| 134 |
duplicates_removed = original_news_count - remaining_news_count
|
| 135 |
|
| 136 |
st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
|
| 137 |
|
|
|
|
| 138 |
# Translate texts
|
| 139 |
translated_texts = []
|
| 140 |
lemmatized_texts = []
|
|
|
|
| 154 |
progress_text.text(f"{i + 1} из {total_news} сообщений предобработано")
|
| 155 |
|
| 156 |
# Perform sentiment analysis
|
|
|
|
| 157 |
rubert2_results = [get_rubert2_sentiment(text) for text in texts]
|
| 158 |
finbert_results = [get_finbert_sentiment(text) for text in translated_texts]
|
| 159 |
roberta_results = [get_roberta_sentiment(text) for text in translated_texts]
|
| 160 |
finbert_tone_results = [get_finbert_tone_sentiment(text) for text in translated_texts]
|
| 161 |
|
| 162 |
+
# Create a new DataFrame with processed data
|
| 163 |
+
processed_df = pd.DataFrame({
|
| 164 |
+
'Объект': df['Объект'],
|
| 165 |
+
'Заголовок': df['Заголовок'], # Preserve original 'Заголовок'
|
| 166 |
+
'ruBERT2': rubert2_results,
|
| 167 |
+
'FinBERT': finbert_results,
|
| 168 |
+
'RoBERTa': roberta_results,
|
| 169 |
+
'FinBERT-Tone': finbert_tone_results,
|
| 170 |
+
'Выдержки из текста': df['Выдержки из текста'],
|
| 171 |
+
'Translated': translated_texts
|
| 172 |
+
})
|
| 173 |
|
| 174 |
+
return processed_df
|
| 175 |
|
| 176 |
def create_output_file(df, uploaded_file, analysis_df):
|
| 177 |
# Create a new workbook
|
|
|
|
| 239 |
return output
|
| 240 |
|
| 241 |
def main():
|
| 242 |
+
st.title("... приступим к анализу... версия 36+")
|
| 243 |
|
| 244 |
uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
|
| 245 |
|