Spaces:

Mooo-osama03
/

Topicclassification

Sleeping

App Files Community

Mooo-osama03 committed on Oct 11, 2025

Commit

bd7a3b4

verified ·

1 Parent(s): 6d5ccf0

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -6

app.py CHANGED Viewed

@@ -5,7 +5,12 @@ import pandas as pd
 import numpy as np
 from sklearn.cluster import KMeans
 from sentence_transformers import SentenceTransformer
 # ---------- Helper: extract text from PDF ----------
 def extract_text_from_pdf(pdf_path):
@@ -41,8 +46,15 @@ def transformer_topic_modeling(sentences, auto_topics=True, num_topics=5):
     for i in range(num_topics):
         topic_sentences = df[df["Topic"] == i]["Sentence"].tolist()
         joined_text = " ".join(topic_sentences)
-        words = re.findall(r"\b\w+\b", joined_text.lower())
-        top_words = pd.Series(words).value_counts().head(3).index.tolist()
         title = " & ".join(top_words).title()
         topics.append((title, " ".join(topic_sentences[:3])))
@@ -70,17 +82,17 @@ def analyze_input(pdf_file, essay_text):
         topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)
         print("✅ Topics discovered:", num_topics)
-        # Build Markdown output (Gradio-safe)
         output_lines = [f"✅ **Detected {num_topics} Topics:**\n"]
         for i, (title, examples) in enumerate(topic_data, 1):
             output_lines.append(f"**Topic {i}: {title}**\n{examples}\n")
         result = "\n\n".join(output_lines)
-        return result  # ✅ Must return a string
     except Exception as e:
         import traceback
-        print(traceback.format_exc())  # full error log for Hugging Face
         return f"⚠️ Error: {str(e)}"
@@ -92,7 +104,7 @@ demo = gr.Interface(
         gr.Textbox(label="📝 Essay Text", lines=7, placeholder="Write or paste your essay here...")
     ],
     outputs=gr.Markdown(label="🧠 Topic Analysis Result"),
-    title="Topic Modeling App",
     description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
 )

 import numpy as np
 from sklearn.cluster import KMeans
 from sentence_transformers import SentenceTransformer
+import nltk
+from nltk.corpus import stopwords
+# ---------- Setup ----------
+nltk.download('stopwords', quiet=True)
+stop_words = set(stopwords.words('english'))
 # ---------- Helper: extract text from PDF ----------
 def extract_text_from_pdf(pdf_path):
     for i in range(num_topics):
         topic_sentences = df[df["Topic"] == i]["Sentence"].tolist()
         joined_text = " ".join(topic_sentences)
+        # --- Extract keywords excluding stopwords ---
+        words = re.findall(r"\b[a-z]{3,}\b", joined_text.lower())
+        filtered = [w for w in words if w not in stop_words]
+        if filtered:
+            top_words = pd.Series(filtered).value_counts().head(3).index.tolist()
+        else:
+            top_words = ["General"]
         title = " & ".join(top_words).title()
         topics.append((title, " ".join(topic_sentences[:3])))
         topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)
         print("✅ Topics discovered:", num_topics)
+        # Build Markdown output for Gradio
         output_lines = [f"✅ **Detected {num_topics} Topics:**\n"]
         for i, (title, examples) in enumerate(topic_data, 1):
             output_lines.append(f"**Topic {i}: {title}**\n{examples}\n")
         result = "\n\n".join(output_lines)
+        return result  # ✅ Return string only
     except Exception as e:
         import traceback
+        print(traceback.format_exc())  # full log in Hugging Face console
         return f"⚠️ Error: {str(e)}"
         gr.Textbox(label="📝 Essay Text", lines=7, placeholder="Write or paste your essay here...")
     ],
     outputs=gr.Markdown(label="🧠 Topic Analysis Result"),
+    title="Topic Modeling App (PDF + Essay)",
     description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
 )