Spaces:

MHamdan
/

ContentAnalyzer

Sleeping

App Files Community

MHamdan committed on Feb 15, 2025

Commit

523e9ce

verified ·

1 Parent(s): 2af5feb

update app

Browse files

Files changed (1) hide show

app.py +81 -73

app.py CHANGED Viewed

@@ -1,5 +1,3 @@
-# app.py
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
@@ -7,96 +5,62 @@ from transformers import pipeline
 import PyPDF2
 import docx
 import os
-import time
-from typing import List, Tuple, Optional
 class ContentAnalyzer:
     def __init__(self):
-        print("[DEBUG] Initializing pipelines...")
         self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
         self.sentiment_analyzer = pipeline("sentiment-analysis")
         self.zero_shot = pipeline("zero-shot-classification")
-        print("[DEBUG] Pipelines initialized.")
     def read_file(self, file_obj) -> str:
         """Read content from different file types."""
         if file_obj is None:
-            print("[DEBUG] No file uploaded.")
             return ""
         file_ext = os.path.splitext(file_obj.name)[1].lower()
-        print(f"[DEBUG] Uploaded file extension detected: {file_ext}")
         try:
             if file_ext == '.txt':
-                content = file_obj.read().decode('utf-8')
-                print("[DEBUG] Successfully read .txt file.")
-                return content
             elif file_ext == '.pdf':
-                # Note: For PyPDF2 >= 3.0.0, this usage is valid
                 pdf_reader = PyPDF2.PdfReader(file_obj)
                 text = ""
                 for page in pdf_reader.pages:
                     text += page.extract_text() + "\n"
-                print("[DEBUG] Successfully read .pdf file.")
                 return text
             elif file_ext == '.docx':
                 doc = docx.Document(file_obj)
-                paragraphs = [paragraph.text for paragraph in doc.paragraphs]
-                print("[DEBUG] Successfully read .docx file.")
-                return "\n".join(paragraphs)
             else:
-                msg = f"Unsupported file type: {file_ext}"
-                print("[DEBUG]", msg)
-                return msg
         except Exception as e:
-            error_msg = f"Error reading file: {str(e)}"
-            print("[DEBUG]", error_msg)
-            return error_msg
     def fetch_web_content(self, url: str) -> str:
         """Fetch content from URL."""
-        print(f"[DEBUG] Attempting to fetch URL: {url}")
         try:
             response = requests.get(url, timeout=10)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, 'html.parser')
-            # Remove scripts and styles
             for script in soup(["script", "style"]):
                 script.decompose()
             text = soup.get_text(separator='\n')
             lines = (line.strip() for line in text.splitlines())
-            final_text = "\n".join(line for line in lines if line)
-            print("[DEBUG] Successfully fetched and cleaned web content.")
-            return final_text
         except Exception as e:
-            error_msg = f"Error fetching URL: {str(e)}"
-            print("[DEBUG]", error_msg)
-            return error_msg
     def analyze_content(
-        self,
         text: Optional[str] = None,
         url: Optional[str] = None,
         file: Optional[object] = None,
         analysis_types: List[str] = ["summarize"],
         progress_callback=None
     ) -> dict:
-        """
-        Analyze content from text, URL, or file.
-        progress_callback is a function for updating progress steps.
-        """
         try:
-            # Step 1: Retrieve content
             if progress_callback:
-                progress_callback(1, "Reading input...")
             if url:
                 content = self.fetch_web_content(url)
@@ -108,31 +72,30 @@ class ContentAnalyzer:
             if not content or content.startswith("Error"):
                 return {"error": content or "No content provided"}
-            # Truncate for debug
             truncated = content[:1000] + "..." if len(content) > 1000 else content
             results = {"original_text": truncated}
-            # Step 2: Summarize
             if "summarize" in analysis_types:
                 if progress_callback:
-                    progress_callback(2, "Summarizing content...")
                 summary = self.summarizer(content[:1024], max_length=130, min_length=30)
                 results["summary"] = summary[0]['summary_text']
-            # Step 3: Sentiment
             if "sentiment" in analysis_types:
                 if progress_callback:
-                    progress_callback(3, "Performing sentiment analysis...")
                 sentiment = self.sentiment_analyzer(content[:512])
                 results["sentiment"] = {
                     "label": sentiment[0]['label'],
                     "score": round(sentiment[0]['score'], 3)
                 }
-            # Step 4: Topics
             if "topics" in analysis_types:
                 if progress_callback:
-                    progress_callback(4, "Identifying topics...")
                 topics = self.zero_shot(
                     content[:512],
                     candidate_labels=[
@@ -149,9 +112,8 @@ class ContentAnalyzer:
             return results
         except Exception as e:
-            error_msg = f"Analysis error: {str(e)}"
-            print("[DEBUG]", error_msg)
-            return {"error": error_msg}
 def create_interface():
     analyzer = ContentAnalyzer()
@@ -160,12 +122,47 @@ def create_interface():
         gr.Markdown("# 📑 Content Analyzer")
         gr.Markdown("Analyze text content from various sources using AI.")
-        with gr.Tabs():
-            # Tabs for Text Input, Web URL, File Upload...
-            text_input = gr.Textbox(label="Enter Text", placeholder="Paste your text here...", lines=5)
-            url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com")
-            file_input = gr.File(label="Upload File", file_types=[".txt", ".pdf", ".docx"])
         analysis_types = gr.CheckboxGroup(
             choices=["summarize", "sentiment", "topics"],
             value=["summarize"],
@@ -174,6 +171,7 @@ def create_interface():
         analyze_btn = gr.Button("Analyze", variant="primary")
         with gr.Tabs():
             with gr.Tab("Original Text"):
                 original_text = gr.Markdown()
@@ -184,22 +182,32 @@ def create_interface():
             with gr.Tab("Topics"):
                 topics_output = gr.Markdown()
-        def process_analysis(text, url, file, types, progress=gr.Progress()):
             steps_total = 4
             def progress_callback(step: int, desc: str):
-                """
-                step: integer step index (1 to steps_total)
-                desc: a short description of the current step
-                """
-                # Pass the integer 'step' as iteration, and the string 'desc' as desc.
                 progress(step, total=steps_total, desc=desc)
-            # Call your analyzer
             results = analyzer.analyze_content(
-                text=text,
-                url=url,
-                file=file,
                 analysis_types=types,
                 progress_callback=progress_callback
             )
@@ -224,7 +232,7 @@ def create_interface():
         analyze_btn.click(
             fn=process_analysis,
-            inputs=[text_input, url_input, file_input, analysis_types],
             outputs=[original_text, summary_output, sentiment_output, topics_output],
             show_progress=True
         )
@@ -233,4 +241,4 @@ def create_interface():
 if __name__ == "__main__":
     demo = create_interface()
-    demo.launch()

 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 import PyPDF2
 import docx
 import os
+from typing import List, Optional
 class ContentAnalyzer:
     def __init__(self):
         self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
         self.sentiment_analyzer = pipeline("sentiment-analysis")
         self.zero_shot = pipeline("zero-shot-classification")
     def read_file(self, file_obj) -> str:
         """Read content from different file types."""
         if file_obj is None:
             return ""
         file_ext = os.path.splitext(file_obj.name)[1].lower()
         try:
             if file_ext == '.txt':
+                return file_obj.read().decode('utf-8')
             elif file_ext == '.pdf':
                 pdf_reader = PyPDF2.PdfReader(file_obj)
                 text = ""
                 for page in pdf_reader.pages:
                     text += page.extract_text() + "\n"
                 return text
             elif file_ext == '.docx':
                 doc = docx.Document(file_obj)
+                return "\n".join([paragraph.text for paragraph in doc.paragraphs])
             else:
+                return f"Unsupported file type: {file_ext}"
         except Exception as e:
+            return f"Error reading file: {str(e)}"
     def fetch_web_content(self, url: str) -> str:
         """Fetch content from URL."""
         try:
             response = requests.get(url, timeout=10)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, 'html.parser')
             for script in soup(["script", "style"]):
                 script.decompose()
             text = soup.get_text(separator='\n')
             lines = (line.strip() for line in text.splitlines())
+            return "\n".join(line for line in lines if line)
         except Exception as e:
+            return f"Error fetching URL: {str(e)}"
     def analyze_content(
+        self,
         text: Optional[str] = None,
         url: Optional[str] = None,
         file: Optional[object] = None,
         analysis_types: List[str] = ["summarize"],
         progress_callback=None
     ) -> dict:
         try:
+            # STEP 1: Retrieve content
             if progress_callback:
+                progress_callback(1, "Reading input")
             if url:
                 content = self.fetch_web_content(url)
             if not content or content.startswith("Error"):
                 return {"error": content or "No content provided"}
             truncated = content[:1000] + "..." if len(content) > 1000 else content
             results = {"original_text": truncated}
+            # STEP 2: Summarize
             if "summarize" in analysis_types:
                 if progress_callback:
+                    progress_callback(2, "Summarizing content")
                 summary = self.summarizer(content[:1024], max_length=130, min_length=30)
                 results["summary"] = summary[0]['summary_text']
+            # STEP 3: Sentiment
             if "sentiment" in analysis_types:
                 if progress_callback:
+                    progress_callback(3, "Performing sentiment analysis")
                 sentiment = self.sentiment_analyzer(content[:512])
                 results["sentiment"] = {
                     "label": sentiment[0]['label'],
                     "score": round(sentiment[0]['score'], 3)
                 }
+            # STEP 4: Topics
             if "topics" in analysis_types:
                 if progress_callback:
+                    progress_callback(4, "Identifying topics")
                 topics = self.zero_shot(
                     content[:512],
                     candidate_labels=[
             return results
         except Exception as e:
+            return {"error": f"Analysis error: {str(e)}"}
 def create_interface():
     analyzer = ContentAnalyzer()
         gr.Markdown("# 📑 Content Analyzer")
         gr.Markdown("Analyze text content from various sources using AI.")
+        # Dropdown to choose input type
+        input_choice = gr.Dropdown(
+            choices=["Text", "URL", "File"],
+            value="Text",
+            label="Select Input Type"
+        )
+        # Containers for each input type
+        with gr.Column(visible=True) as text_col:
+            text_input = gr.Textbox(
+                label="Enter Text",
+                placeholder="Paste your text here...",
+                lines=5
+            )
+        with gr.Column(visible=False) as url_col:
+            url_input = gr.Textbox(
+                label="Enter URL",
+                placeholder="https://example.com"
+            )
+        with gr.Column(visible=False) as file_col:
+            file_input = gr.File(
+                label="Upload File",
+                file_types=[".txt", ".pdf", ".docx"]
+            )
+        # Callback function to show/hide input columns
+        def show_inputs(choice):
+            return {
+                text_col: choice == "Text",
+                url_col: choice == "URL",
+                file_col: choice == "File"
+            }
+        # Trigger showing/hiding based on the dropdown choice
+        input_choice.change(
+            fn=show_inputs,
+            inputs=[input_choice],
+            outputs=[text_col, url_col, file_col]
+        )
+        # Analysis Options
         analysis_types = gr.CheckboxGroup(
             choices=["summarize", "sentiment", "topics"],
             value=["summarize"],
         analyze_btn = gr.Button("Analyze", variant="primary")
+        # Output Sections in tabs
         with gr.Tabs():
             with gr.Tab("Original Text"):
                 original_text = gr.Markdown()
             with gr.Tab("Topics"):
                 topics_output = gr.Markdown()
+        def process_analysis(choice, text, url, file, types, progress=gr.Progress()):
+            """Orchestrates analysis depending on input choice."""
             steps_total = 4
             def progress_callback(step: int, desc: str):
                 progress(step, total=steps_total, desc=desc)
+            # Determine which content to pass based on the input choice
+            if choice == "Text":
+                content_text = text
+                content_url = None
+                content_file = None
+            elif choice == "URL":
+                content_text = None
+                content_url = url
+                content_file = None
+            else:  # choice == "File"
+                content_text = None
+                content_url = None
+                content_file = file
+            # Perform analysis
             results = analyzer.analyze_content(
+                text=content_text,
+                url=content_url,
+                file=content_file,
                 analysis_types=types,
                 progress_callback=progress_callback
             )
         analyze_btn.click(
             fn=process_analysis,
+            inputs=[input_choice, text_input, url_input, file_input, analysis_types],
             outputs=[original_text, summary_output, sentiment_output, topics_output],
             show_progress=True
         )
 if __name__ == "__main__":
     demo = create_interface()
+    demo.launch()