Spaces:

pentarosarium
/

gprocess

Build error

App Files Community

pentarosarium committed on Nov 20, 2024

Commit

4feef77

1 Parent(s): e20a82b

v.1.12

Browse files

Files changed (1) hide show

app.py +107 -76

app.py CHANGED Viewed

@@ -28,6 +28,19 @@ def fuzzy_deduplicate(df, column, threshold=55):
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 class EventDetector:
     def __init__(self):
         self.model_name = "google/mt5-small"
@@ -36,60 +49,16 @@ class EventDetector:
         self.finbert = None
         self.roberta = None
         self.finbert_tone = None
-    @spaces.GPU
-    def initialize_models(self):
-        try:
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            logger.info(f"Initializing models on device: {device}")
-            self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name).to(device)
-            self.finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert", device=device)
-            self.roberta = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment", device=device)
-            self.finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone", device=device)
-            return True
-        except Exception as e:
-            logger.error(f"Model initialization error: {str(e)}")
-            return False
-    @spaces.GPU
-    def detect_events(self, text, entity):
-        if not text or not entity:
-            return "Нет", "Invalid input"
-        try:
-            if self.model is None:
-                if not self.initialize_models():
-                    return "Нет", "Model initialization failed"
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            # Truncate input text to avoid tensor size mismatch
-            text = text[:500]  # Adjust this value if needed
-            prompt = f"""<s>Analyze the following news about {entity}:
-            Text: {text}
-            Task: Identify the main event type and provide a brief summary.</s>"""
-            inputs = self.tokenizer(prompt, return_tensors="pt", padding=True,
-                                  truncation=True, max_length=512).to(device)
-            outputs = self.model.generate(**inputs, max_length=300, num_return_sequences=1)
-            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            event_type = "Нет"
-            if any(term in text.lower() for term in ['отчет', 'выручка', 'прибыль', 'ebitda']):
-                event_type = "Отчетность"
-            elif any(term in text.lower() for term in ['облигаци', 'купон', 'дефолт']):
-                event_type = "РЦБ"
-            elif any(term in text.lower() for term in ['суд', 'иск', 'арбитраж']):
-                event_type = "Суд"
-            return event_type, response
-        except Exception as e:
-            logger.error(f"Event detection error: {str(e)}")
-            return "Нет", f"Error: {str(e)}"
     @spaces.GPU
     def analyze_sentiment(self, text):
@@ -98,21 +67,19 @@ class EventDetector:
                 if not self.initialize_models():
                     return "Neutral"
-            # Truncate text to avoid tensor size issues
             truncated_text = text[:500]
             results = []
             try:
-                # Process text with all models in a batch
                 inputs = [truncated_text]
                 finbert_result = self.finbert(inputs, truncation=True, max_length=512)[0]
                 roberta_result = self.roberta(inputs, truncation=True, max_length=512)[0]
                 finbert_tone_result = self.finbert_tone(inputs, truncation=True, max_length=512)[0]
                 results = [
-                    self._get_sentiment(finbert_result),
-                    self._get_sentiment(roberta_result),
-                    self._get_sentiment(finbert_tone_result)
                 ]
             except Exception as e:
@@ -212,8 +179,10 @@ def process_file(file_obj):
         raise
 def create_interface():
     with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# AI-анализ мониторинга новостей v.1.11")
         with gr.Row():
             file_input = gr.File(
@@ -223,10 +192,17 @@ def create_interface():
             )
         with gr.Row():
-            analyze_btn = gr.Button(
-                "Начать анализ",
-                variant="primary"
-            )
         with gr.Row():
             progress = gr.Textbox(
@@ -248,35 +224,89 @@ def create_interface():
             with gr.Column():
                 events_plot = gr.Plot(label="Распределение событий")
         def analyze(file_bytes):
             if file_bytes is None:
                 gr.Warning("Пожалуйста, загрузите файл")
                 return None, None, None, "Ожидание файла..."
             try:
-                # Create BytesIO object and debug print its content
                 file_obj = io.BytesIO(file_bytes)
                 logger.info("File loaded into BytesIO successfully")
-                # Process file with progress updates
                 progress_status = "Начинаем обработку файла..."
                 yield None, None, None, progress_status
-                df = process_file(file_obj)
-                if df.empty:
-                    return None, None, None, "Нет данных для обработки"
-                progress_status = f"Создание визуализаций..."
-                yield None, None, None, progress_status
-                fig_sentiment, fig_events = create_visualizations(df)
                 return (
-                    df,
-                    fig_sentiment,
-                    fig_events,
-                    f"Обработка завершена успешно! Обработано {len(df)} строк"
                 )
             except Exception as e:
@@ -285,6 +315,7 @@ def create_interface():
                 gr.Error(error_msg)
                 return None, None, None, error_msg
         analyze_btn.click(
             fn=analyze,
             inputs=[file_input],

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+class ProcessControl:
+    def __init__(self):
+        self.stop_requested = False
+    def request_stop(self):
+        self.stop_requested = True
+    def should_stop(self):
+        return self.stop_requested
+    def reset(self):
+        self.stop_requested = False
 class EventDetector:
     def __init__(self):
         self.model_name = "google/mt5-small"
         self.finbert = None
         self.roberta = None
         self.finbert_tone = None
+        self.control = ProcessControl()
+    def get_sentiment_label(self, result):
+        """Helper method for sentiment classification"""
+        label = result['label'].lower()
+        if label in ["positive", "label_2", "pos"]:
+            return "Positive"
+        elif label in ["negative", "label_0", "neg"]:
+            return "Negative"
+        return "Neutral"
     @spaces.GPU
     def analyze_sentiment(self, text):
                 if not self.initialize_models():
                     return "Neutral"
             truncated_text = text[:500]
             results = []
             try:
                 inputs = [truncated_text]
                 finbert_result = self.finbert(inputs, truncation=True, max_length=512)[0]
                 roberta_result = self.roberta(inputs, truncation=True, max_length=512)[0]
                 finbert_tone_result = self.finbert_tone(inputs, truncation=True, max_length=512)[0]
                 results = [
+                    self.get_sentiment_label(finbert_result),
+                    self.get_sentiment_label(roberta_result),
+                    self.get_sentiment_label(finbert_tone_result)
                 ]
             except Exception as e:
         raise
 def create_interface():
+    control = ProcessControl()
     with gr.Blocks(theme=gr.themes.Soft()) as app:
+        gr.Markdown("# AI-анализ мониторинга новостей v.1.12")
         with gr.Row():
             file_input = gr.File(
             )
         with gr.Row():
+            col1, col2 = gr.Columns(2)
+            with col1:
+                analyze_btn = gr.Button(
+                    "Начать анализ",
+                    variant="primary"
+                )
+            with col2:
+                stop_btn = gr.Button(
+                    "❌ Остановить",
+                    variant="stop"
+                )
         with gr.Row():
             progress = gr.Textbox(
             with gr.Column():
                 events_plot = gr.Plot(label="Распределение событий")
+        def stop_processing():
+            control.request_stop()
+            return "Остановка обработки..."
         def analyze(file_bytes):
             if file_bytes is None:
                 gr.Warning("Пожалуйста, загрузите файл")
                 return None, None, None, "Ожидание файла..."
             try:
+                # Reset stop flag
+                control.reset()
                 file_obj = io.BytesIO(file_bytes)
                 logger.info("File loaded into BytesIO successfully")
                 progress_status = "Начинаем обработку файла..."
                 yield None, None, None, progress_status
+                # Process file
+                df = pd.read_excel(file_obj, sheet_name='Публикации')
+                logger.info(f"Successfully read Excel file. Shape: {df.shape}")
+                # Deduplication
+                original_count = len(df)
+                df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
+                logger.info(f"Removed {original_count - len(df)} duplicate entries")
+                detector = EventDetector()
+                detector.control = control  # Pass control object
+                processed_rows = []
+                total = len(df)
+                # Initialize models
+                if not detector.initialize_models():
+                    raise Exception("Failed to initialize models")
+                for idx, row in df.iterrows():
+                    if control.should_stop():
+                        yield (
+                            pd.DataFrame(processed_rows) if processed_rows else None,
+                            None, None,
+                            f"Обработка остановлена. Обработано {idx} из {total} строк"
+                        )
+                        return
+                    try:
+                        text = str(row.get('Выдержки из текста', ''))
+                        if not text.strip():
+                            continue
+                        entity = str(row.get('Объект', ''))
+                        if not entity.strip():
+                            continue
+                        event_type, event_summary = detector.detect_events(text, entity)
+                        sentiment = detector.analyze_sentiment(text)
+                        processed_rows.append({
+                            'Объект': entity,
+                            'Заголовок': str(row.get('Заголовок', '')),
+                            'Sentiment': sentiment,
+                            'Event_Type': event_type,
+                            'Event_Summary': event_summary,
+                            'Текст': text[:1000]
+                        })
+                        if idx % 5 == 0:
+                            progress_status = f"Обработано {idx + 1}/{total} строк"
+                            yield None, None, None, progress_status
+                    except Exception as e:
+                        logger.error(f"Error processing row {idx}: {str(e)}")
+                        continue
+                result_df = pd.DataFrame(processed_rows)
+                fig_sentiment, fig_events = create_visualizations(result_df)
                 return (
+                    result_df,
+                    fig_sentiment,
+                    fig_events,
+                    f"Обработка завершена успешно! Обработано {len(result_df)} строк"
                 )
             except Exception as e:
                 gr.Error(error_msg)
                 return None, None, None, error_msg
+        stop_btn.click(fn=stop_processing, outputs=[progress])
         analyze_btn.click(
             fn=analyze,
             inputs=[file_input],