Mooo-osama03 committed on
Commit
6d5ccf0
·
verified ·
1 Parent(s): b94af5b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -123
app.py CHANGED
@@ -1,123 +1,100 @@
1
- import re
2
- import fitz # PyMuPDF
3
- import pandas as pd
4
- from collections import Counter
5
- from sklearn.cluster import KMeans
6
- from sklearn.metrics import silhouette_score
7
- from sentence_transformers import SentenceTransformer
8
- from nltk.corpus import stopwords
9
- import nltk
10
- import gradio as gr
11
-
12
- # ----------------------------
13
- # πŸ“¦ Setup
14
- # ----------------------------
15
- nltk.download('stopwords', quiet=True)
16
- STOPWORDS = set(stopwords.words('english'))
17
-
18
- # ----------------------------
19
- # πŸ“˜ PDF Text Extraction
20
- # ----------------------------
21
- def extract_text_from_pdf(pdf_file):
22
- """Extract text from uploaded PDF file"""
23
- text = ""
24
- with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
25
- for page in doc:
26
- text += page.get_text("text")
27
- return text.strip()
28
-
29
- # ----------------------------
30
- # 🧹 Text Cleaning
31
- # ----------------------------
32
- def clean_text(text):
33
- """Clean and remove stopwords"""
34
- text = re.sub(r"[^a-zA-Z ]", " ", text)
35
- words = [w.lower() for w in text.split() if w.lower() not in STOPWORDS and len(w) > 2]
36
- return words
37
-
38
- # ----------------------------
39
- # πŸ€– Topic Modeling Function
40
- # ----------------------------
41
- def transformer_topic_modeling(sentences, auto_topics=True, max_k=8, fixed_k=5):
42
- """Cluster sentences into topics using transformer embeddings"""
43
- model = SentenceTransformer('flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot')
44
- embeddings = model.encode(sentences, show_progress_bar=False)
45
-
46
- # --- Auto-select topic number ---
47
- if auto_topics:
48
- if len(sentences) < 3:
49
- num_topics = 1
50
- else:
51
- scores = []
52
- for k in range(2, min(max_k, len(sentences))):
53
- kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(embeddings)
54
- try:
55
- score = silhouette_score(embeddings, kmeans.labels_)
56
- scores.append((k, score))
57
- except:
58
- continue
59
- num_topics = max(scores, key=lambda x: x[1])[0] if scores else 2
60
- else:
61
- num_topics = fixed_k
62
-
63
- # --- Clustering ---
64
- kmeans = KMeans(n_clusters=num_topics, random_state=42, n_init=10)
65
- kmeans.fit(embeddings)
66
- df = pd.DataFrame({"Sentence": sentences, "Topic": kmeans.labels_})
67
-
68
- # --- Build topic summaries ---
69
- topic_data = []
70
- for topic_id in range(num_topics):
71
- topic_sentences = df[df["Topic"] == topic_id]["Sentence"].tolist()
72
- words = []
73
- for s in topic_sentences:
74
- words.extend(clean_text(s))
75
- word_freq = Counter(words)
76
- top_words = [w for w, _ in word_freq.most_common(3)]
77
- title = " & ".join(top_words).capitalize() if top_words else "Miscellaneous"
78
- examples = topic_sentences[:3]
79
- topic_data.append((f"Topic {topic_id + 1}: {title}", "\n".join(examples)))
80
-
81
- return topic_data, num_topics
82
-
83
- # ----------------------------
84
- # πŸš€ Gradio Interface Logic
85
- # ----------------------------
86
- def analyze_input(pdf_file, essay_text):
87
- pdf_text = ""
88
- if pdf_file:
89
- pdf_text = extract_text_from_pdf(pdf_file)
90
-
91
- full_text = (pdf_text + "\n" + (essay_text or "")).strip()
92
- if not full_text:
93
- return "❌ Please upload a PDF or write an essay."
94
-
95
- sentences = [s.strip() for s in re.split(r'[.!?]', full_text) if len(s.strip()) > 20]
96
- if len(sentences) < 2:
97
- return "⚠️ Not enough text for topic modeling."
98
-
99
- topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)
100
-
101
- # --- Display output ---
102
- output_text = f"βœ… **Detected {num_topics} Topics:**\n\n"
103
- for title, examples in topic_data:
104
- output_text += f"### {title}\n{examples}\n\n"
105
-
106
- return output_text
107
-
108
- # ----------------------------
109
- # 🎨 Gradio Interface
110
- # ----------------------------
111
- demo = gr.Interface(
112
- fn=analyze_input,
113
- inputs=[
114
- gr.File(label="πŸ“‚ Upload PDF (optional)"),
115
- gr.Textbox(lines=10, placeholder="✍️ Write or paste your essay here...", label="Essay Text")
116
- ],
117
- outputs=gr.Markdown(label="🧠 Detected Topics"),
118
- title="PDF + Essay Topic Discovery (Transformer-Based)",
119
- description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
120
- )
121
-
122
- if __name__ == "__main__":
123
- demo.launch()
 
1
+ import gradio as gr
2
+ import re
3
+ import fitz # PyMuPDF for PDF extraction
4
+ import pandas as pd
5
+ import numpy as np
6
+ from sklearn.cluster import KMeans
7
+ from sentence_transformers import SentenceTransformer
8
+
9
+
10
+ # ---------- Helper: extract text from PDF ----------
11
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at *pdf_path*.

    Uses PyMuPDF (``fitz``). Pages are concatenated in document order;
    an empty or image-only PDF yields "".
    """
    # Collect per-page text and join once — avoids quadratic `text +=`
    # accumulation on large documents.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
17
+
18
+
19
+ # ---------- Helper: Transformer Topic Modeling ----------
20
def transformer_topic_modeling(sentences, auto_topics=True, num_topics=5):
    """Cluster *sentences* into topics using sentence-transformer embeddings.

    Parameters
    ----------
    sentences : list[str]
        Sentences to cluster (expects at least 2).
    auto_topics : bool
        If True, choose the topic count via an elbow heuristic on KMeans
        inertia; otherwise use *num_topics*.
    num_topics : int
        Fixed topic count used when ``auto_topics`` is False.

    Returns
    -------
    (topics, num_topics)
        ``topics`` is a list of ``(title, example_sentences)`` tuples.
    """
    print("🔹 Using Transformer-based Embeddings...")
    model = SentenceTransformer("flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot")
    embeddings = model.encode(sentences)

    if auto_topics:
        # Elbow heuristic: pick the k just before the largest inertia drop.
        distortions = []
        K = range(2, min(10, len(sentences) // 2 + 2))
        for k in K:
            km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(embeddings)
            distortions.append(km.inertia_)
        diffs = np.diff(distortions)
        num_topics = K[np.argmin(diffs)] if len(diffs) > 0 else 3

    # Guard: KMeans requires n_clusters <= n_samples (the heuristic's
    # fallback of 3, or a caller-supplied num_topics, may exceed it).
    num_topics = max(1, min(num_topics, len(sentences)))

    kmeans = KMeans(n_clusters=num_topics, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)
    df = pd.DataFrame({"Sentence": sentences, "Topic": labels})

    # Words too short or too common to make a meaningful topic title.
    trivial = {
        "the", "and", "for", "that", "with", "this", "are", "was",
        "have", "has", "but", "not", "you", "all", "can", "from",
    }

    topics = []
    for i in range(num_topics):
        topic_sentences = df[df["Topic"] == i]["Sentence"].tolist()
        joined_text = " ".join(topic_sentences)
        words = [
            w for w in re.findall(r"\b\w+\b", joined_text.lower())
            if len(w) > 2 and w not in trivial
        ]
        top_words = pd.Series(words).value_counts().head(3).index.tolist()
        # An empty cluster (or one with no usable words) gets a fallback title.
        title = " & ".join(top_words).title() if top_words else "Miscellaneous"
        topics.append((title, " ".join(topic_sentences[:3])))

    return topics, num_topics
50
+
51
+
52
+ # ---------- Main Function ----------
53
# ---------- Main Function ----------
def analyze_input(pdf_file, essay_text):
    """Gradio handler: merge PDF text with the typed essay, run topic
    modeling, and return a Markdown summary string (or a user-facing
    error message)."""
    try:
        extracted = extract_text_from_pdf(pdf_file.name) if pdf_file else ""
        if pdf_file:
            print("✅ PDF extracted successfully, length:", len(extracted))

        combined = (extracted + "\n" + (essay_text or "")).strip()
        if not combined:
            return "❌ Please upload a PDF or write an essay."

        # Keep only substantial sentence fragments (> 20 chars after strip).
        fragments = re.split(r'[.!?]', combined)
        sentences = [frag.strip() for frag in fragments if len(frag.strip()) > 20]
        print("🧾 Sentence count:", len(sentences))

        if len(sentences) < 2:
            return "⚠️ Not enough text for topic modeling."

        topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)
        print("✅ Topics discovered:", num_topics)

        # Assemble Markdown output (Gradio-safe: a plain string).
        parts = [f"✅ **Detected {num_topics} Topics:**\n"]
        parts.extend(
            f"**Topic {idx}: {title}**\n{examples}\n"
            for idx, (title, examples) in enumerate(topic_data, 1)
        )
        return "\n\n".join(parts)

    except Exception as exc:
        import traceback
        print(traceback.format_exc())  # full error log for Hugging Face
        return f"⚠️ Error: {str(exc)}"
85
+
86
+
87
+ # ---------- Gradio UI ----------
88
# ---------- Gradio UI ----------
# Single-function interface: two inputs (an optional PDF upload and a
# free-text essay box) feed analyze_input; the result renders in one
# Markdown output pane.
demo = gr.Interface(
    fn=analyze_input,
    inputs=[
        gr.File(label="📂 Upload a PDF (optional)"),
        gr.Textbox(label="📝 Essay Text", lines=7, placeholder="Write or paste your essay here...")
    ],
    outputs=gr.Markdown(label="🧠 Topic Analysis Result"),
    title="Topic Modeling App",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
)

# Script entry point (how Hugging Face Spaces starts the app).
if __name__ == "__main__":
    demo.launch()