Spaces:

pentarosarium
/

processor

Running

App Files Files Community

pentarosarium committed on Oct 22, 2024

Commit

a87d6f0

1 Parent(s): 1254c79

progress more (2)

Browse files

Files changed (2) hide show

app.py +91 -95
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -10,6 +10,46 @@ from openpyxl import load_workbook
 from langchain_community.chat_models import ChatOpenAI
 from langchain.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough
 def fuzzy_deduplicate(df, column, threshold=65):
     seen_texts = []
@@ -43,27 +83,22 @@ def init_langchain_llm():
         st.error(f"Error initializing the Groq LLM: {str(e)}")
         st.stop()
-def estimate_sentiment_and_impact(llm, news_text, entity):
     template = """
-    First, translate this Russian text into English:
-    "{news}"
-    Then, analyze the translated text about the entity "{entity}" and determine:
-    1. Sentiment (Positive/Negative/Neutral)
-    2. Estimate potential financial impact in Russian rubles for this entity in the next 6 months.
     If precise monetary estimate is not possible, categorize the impact as one of the following:
-    1. "Significant risk of loss"
-    2. "Moderate risk of loss"
-    3. "Minor risk of loss"
-    4. "Probability of profit"
-    5. "Uncertain effect"
-    Provide a brief reasoning (maximum 100 words).
     Your response should be in the following format:
-    Translation: [Your English translation]
-    Sentiment: [Positive/Negative/Neutral]
     Impact: [Your estimate or category]
     Reasoning: [Your reasoning]
     """
@@ -71,44 +106,19 @@ def estimate_sentiment_and_impact(llm, news_text, entity):
     chain = prompt | llm | RunnablePassthrough()
     response = chain.invoke({"entity": entity, "news": news_text})
-    sentiment = "Neutral"
-    impact = "Uncertain effect"
-    reasoning = "Unable to provide reasoning"
     if isinstance(response, str):
         try:
-            # Extract sentiment
-            if "Sentiment:" in response:
-                sentiment_part = response.split("Sentiment:")[1].split("\n")[0].strip().lower()
-                if "positive" in sentiment_part:
-                    sentiment = "Positive"
-                elif "negative" in sentiment_part:
-                    sentiment = "Negative"
-            # Extract impact and reasoning
             if "Impact:" in response and "Reasoning:" in response:
                 impact_part, reasoning_part = response.split("Reasoning:")
                 impact = impact_part.split("Impact:")[1].strip()
                 reasoning = reasoning_part.strip()
-                # Translate impact categories back to Russian
-                impact_mapping = {
-                    "Significant risk of loss": "Значительный риск убытков",
-                    "Moderate risk of loss": "Умеренный риск убытков",
-                    "Minor risk of loss": "Незначительный риск убытков",
-                    "Probability of profit": "Вероятность прибыли",
-                    "Uncertain effect": "Неопределенный эффект"
-                }
-                for eng, rus in impact_mapping.items():
-                    if eng.lower() in impact.lower():
-                        impact = rus
-                        break
         except Exception as e:
             st.error(f"Error parsing LLM response: {str(e)}")
-    return sentiment, impact, reasoning
 def format_elapsed_time(seconds):
     hours, remainder = divmod(int(seconds), 3600)
@@ -153,52 +163,65 @@ def process_file(uploaded_file):
         st.error(f"Error: The following required columns are missing from the input file: {', '.join(missing_columns)}")
         st.stop()
     original_news_count = len(df)
     df = df.groupby('Объект').apply(
         lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
     ).reset_index(drop=True)
     remaining_news_count = len(df)
     duplicates_removed = original_news_count - remaining_news_count
     st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
-    llm = init_langchain_llm()
-    if not llm:
-        st.error("Не удалось инициализировать нейросеть. Пожалуйста, проверьте настройки и попробуйте снова.")
-        st.stop()
     df['Sentiment'] = ''
     df['Impact'] = ''
     df['Reasoning'] = ''
-    progress_bar = st.progress(0)
-    status_text = st.empty()
     for index, row in df.iterrows():
-        sentiment, impact, reasoning = estimate_sentiment_and_impact(
-            llm,
-            row['Выдержки из текста'],
-            row['Объект']
-        )
         df.at[index, 'Sentiment'] = sentiment
-        df.at[index, 'Impact'] = impact
-        df.at[index, 'Reasoning'] = reasoning
         progress = (index + 1) / len(df)
         progress_bar.progress(progress)
         status_text.text(f"Проанализировано {index + 1} из {len(df)} новостей")
         st.write(f"Объект: {row['Объект']}")
         st.write(f"Новость: {row['Заголовок']}")
         st.write(f"Тональность: {sentiment}")
-        st.write(f"Эффект: {impact}")
-        st.write(f"Обоснование: {reasoning}")
         st.write("---")
     progress_bar.empty()
     status_text.empty()
     visualization = generate_sentiment_visualization(df)
     if visualization:
         st.pyplot(visualization)
@@ -229,6 +252,7 @@ def create_analysis_data(df):
 def create_output_file(df, uploaded_file):
     wb = load_workbook("sample_file.xlsx")
     summary_df = pd.DataFrame({
         'Объект': df['Объект'].unique(),
         'Всего новостей': df.groupby('Объект').size(),
@@ -241,44 +265,16 @@ def create_output_file(df, uploaded_file):
     summary_df = summary_df.sort_values('Негативные', ascending=False)
-    ws = wb['Сводка']
-    for r_idx, row in enumerate(dataframe_to_rows(summary_df, index=False, header=True), start=4):
-        for c_idx, value in enumerate(row, start=5):
-            ws.cell(row=r_idx, column=c_idx, value=value)
-    significant_data = []
-    for _, row in df.iterrows():
-        if row['Sentiment'] in ['Negative', 'Positive']:
-            significant_data.append([
-                row['Объект'],
-                'релевантен',
-                row['Sentiment'],
-                row['Impact'],
-                row['Заголовок'],
-                row['Выдержки из текста']
-            ])
-    ws = wb['Значимые']
-    for r_idx, row in enumerate(significant_data, start=3):
-        for c_idx, value in enumerate(row, start=3):
-            ws.cell(row=r_idx, column=c_idx, value=value)
-    analysis_df = create_analysis_data(df)
-    ws = wb['Анализ']
-    for r_idx, row in enumerate(dataframe_to_rows(analysis_df, index=False, header=True), start=4):
-        for c_idx, value in enumerate(row, start=5):
-            ws.cell(row=r_idx, column=c_idx, value=value)
-    original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
-    ws = wb['Публикации']
-    for r_idx, row in enumerate(dataframe_to_rows(original_df, index=False, header=True), start=1):
-        for c_idx, value in enumerate(row, start=1):
-            ws.cell(row=r_idx, column=c_idx, value=value)
     if 'Тех.приложение' not in wb.sheetnames:
         wb.create_sheet('Тех.приложение')
     ws = wb['Тех.приложение']
-    for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=1):
         for c_idx, value in enumerate(row, start=1):
             ws.cell(row=r_idx, column=c_idx, value=value)
@@ -306,7 +302,7 @@ def main():
         unsafe_allow_html=True
     )
-    st.title("::: анализ мониторинга новостей СКАН-ИНТЕРФАКС :::")
     if 'processed_df' not in st.session_state:
         st.session_state.processed_df = None

 from langchain_community.chat_models import ChatOpenAI
 from langchain.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough
+from transformers import pipeline
+# Initialize sentiment analyzers.
+# These three Hugging Face `pipeline` objects are created at module level, so
+# they are built once when the module is imported and are shared by every
+# call to analyze_sentiment().
+finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert")
+roberta = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
+finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")
+# Translate a Russian text to English by prompting the given LLM.
+#
+# Args:
+#     llm: the language-model runnable placed in the middle of the chain.
+#     text: the text substituted into the prompt's {text} placeholder.
+#
+# Returns:
+#     The chain's response with leading/trailing whitespace stripped.
+def translate_text(llm, text):
+    template = """
+    Translate this Russian text into English:
+    "{text}"
+    Your response should contain only the English translation.
+    """
+    prompt = PromptTemplate(template=template, input_variables=["text"])
+    # RunnablePassthrough forwards the LLM output unchanged as the chain result.
+    chain = prompt | llm | RunnablePassthrough()
+    response = chain.invoke({"text": text})
+    # NOTE(review): .strip() assumes the chain returns a plain str. If `llm` is
+    # a chat model that returns a message object, this would need
+    # `response.content` instead -- confirm against init_langchain_llm().
+    return response.strip()
+# Normalize one classifier result to "Positive", "Negative" or "Neutral".
+#
+# Args:
+#     result: a mapping with a 'label' key (one item of a pipeline's output).
+#
+# Returns:
+#     "Positive" or "Negative" when the lower-cased label is in the matching
+#     list below; "Neutral" for every other label.
+def get_mapped_sentiment(result):
+    # Lower-case first so labels such as "Positive" and "positive" match alike.
+    label = result['label'].lower()
+    # "label_2" / "label_0" cover models that emit generic LABEL_n names.
+    # NOTE(review): presumably LABEL_2 = positive and LABEL_0 = negative for
+    # the roberta model used in this file -- confirm against its model card.
+    if label in ["positive", "label_2", "pos", "pos_label"]:
+        return "Positive"
+    elif label in ["negative", "label_0", "neg", "neg_label"]:
+        return "Negative"
+    return "Neutral"
+# Classify `text` with the three module-level pipelines and combine the votes.
+#
+# Args:
+#     text: the text passed to each pipeline (process_file passes the English
+#         translation).
+#
+# Returns:
+#     "Negative" if at least one model says Negative; "Positive" only if all
+#     three say Positive; "Neutral" otherwise.
+def analyze_sentiment(text):
+    # Each pipeline call truncates the input (max_length=512); only the first
+    # result item is used and mapped to the common three-way label set.
+    finbert_result = get_mapped_sentiment(finbert(text, truncation=True, max_length=512)[0])
+    roberta_result = get_mapped_sentiment(roberta(text, truncation=True, max_length=512)[0])
+    finbert_tone_result = get_mapped_sentiment(finbert_tone(text, truncation=True, max_length=512)[0])
+    # Consider sentiment negative if any model says it's negative
+    # (deliberately asymmetric: a single Negative vote outweighs two others).
+    if any(result == "Negative" for result in [finbert_result, roberta_result, finbert_tone_result]):
+        return "Negative"
+    # Positive requires a unanimous vote.
+    elif all(result == "Positive" for result in [finbert_result, roberta_result, finbert_tone_result]):
+        return "Positive"
+    return "Neutral"
 def fuzzy_deduplicate(df, column, threshold=65):
     seen_texts = []
         st.error(f"Error initializing the Groq LLM: {str(e)}")
         st.stop()
+def estimate_impact(llm, news_text, entity):
     template = """
+    Analyze the following news piece about the entity "{entity}" and estimate its monetary impact in Russian rubles for this entity in the next 6 months.
     If precise monetary estimate is not possible, categorize the impact as one of the following:
+    1. "Значительный риск убытков"
+    2. "Умеренный риск убытков"
+    3. "Незначительный риск убытков"
+    4. "Вероятность прибыли"
+    5. "Неопределенный эффект"
+    Provide brief reasoning (maximum 100 words).
+    News: {news}
     Your response should be in the following format:
     Impact: [Your estimate or category]
     Reasoning: [Your reasoning]
     """
     chain = prompt | llm | RunnablePassthrough()
     response = chain.invoke({"entity": entity, "news": news_text})
+    impact = "Неопределенный эффект"
+    reasoning = "Не удалось получить обоснование"
     if isinstance(response, str):
         try:
             if "Impact:" in response and "Reasoning:" in response:
                 impact_part, reasoning_part = response.split("Reasoning:")
                 impact = impact_part.split("Impact:")[1].strip()
                 reasoning = reasoning_part.strip()
         except Exception as e:
             st.error(f"Error parsing LLM response: {str(e)}")
+    return impact, reasoning
 def format_elapsed_time(seconds):
     hours, remainder = divmod(int(seconds), 3600)
         st.error(f"Error: The following required columns are missing from the input file: {', '.join(missing_columns)}")
         st.stop()
+    # Initialize LLM
+    llm = init_langchain_llm()
+    if not llm:
+        st.error("Не удалось инициализировать нейросеть. Пожалуйста, проверьте настройки и попробуйте снова.")
+        st.stop()
+    # Deduplication
     original_news_count = len(df)
     df = df.groupby('Объект').apply(
         lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
     ).reset_index(drop=True)
     remaining_news_count = len(df)
     duplicates_removed = original_news_count - remaining_news_count
     st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
+    # Initialize progress
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+    # Process each news item
+    df['Translated'] = ''
     df['Sentiment'] = ''
     df['Impact'] = ''
     df['Reasoning'] = ''
     for index, row in df.iterrows():
+        # First: Translate
+        translated_text = translate_text(llm, row['Выдержки из текста'])
+        df.at[index, 'Translated'] = translated_text
+        # Second: Analyze sentiment
+        sentiment = analyze_sentiment(translated_text)
         df.at[index, 'Sentiment'] = sentiment
+        # Third: If negative, estimate impact
+        if sentiment == "Negative":
+            impact, reasoning = estimate_impact(llm, translated_text, row['Объект'])
+            df.at[index, 'Impact'] = impact
+            df.at[index, 'Reasoning'] = reasoning
+        # Update progress
         progress = (index + 1) / len(df)
         progress_bar.progress(progress)
         status_text.text(f"Проанализировано {index + 1} из {len(df)} новостей")
+        # Display results
         st.write(f"Объект: {row['Объект']}")
         st.write(f"Новость: {row['Заголовок']}")
         st.write(f"Тональность: {sentiment}")
+        if sentiment == "Negative":
+            st.write(f"Эффект: {impact}")
+            st.write(f"Обоснование: {reasoning}")
         st.write("---")
     progress_bar.empty()
     status_text.empty()
+    # Generate visualization
     visualization = generate_sentiment_visualization(df)
     if visualization:
         st.pyplot(visualization)
 def create_output_file(df, uploaded_file):
     wb = load_workbook("sample_file.xlsx")
+    # Update 'Сводка' sheet
     summary_df = pd.DataFrame({
         'Объект': df['Объект'].unique(),
         'Всего новостей': df.groupby('Объект').size(),
     summary_df = summary_df.sort_values('Негативные', ascending=False)
+    # Write sheets...
+    # (keep existing code for writing sheets)
+    # Update 'Тех.приложение' sheet to include translated text
+    tech_df = df[['Объект', 'Заголовок', 'Выдержки из текста', 'Translated', 'Sentiment', 'Impact', 'Reasoning']]
     if 'Тех.приложение' not in wb.sheetnames:
         wb.create_sheet('Тех.приложение')
     ws = wb['Тех.приложение']
+    for r_idx, row in enumerate(dataframe_to_rows(tech_df, index=False, header=True), start=1):
         for c_idx, value in enumerate(row, start=1):
             ws.cell(row=r_idx, column=c_idx, value=value)
         unsafe_allow_html=True
     )
+    st.title("::: анализ мониторинга новостей СКАН-ИНТЕРФАКС (2):::")
     if 'processed_df' not in st.session_state:
         st.session_state.processed_df = None

requirements.txt CHANGED Viewed

@@ -15,4 +15,4 @@ langchain-community
 huggingface_hub
 accelerate>=0.26.0
 openai
-wordcloud

 huggingface_hub
 accelerate>=0.26.0
 openai
+wordcloud