Mooo-osama03 committed on
Commit
6d5ccf0
·
verified ·
1 Parent(s): b94af5b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -123
app.py CHANGED
@@ -1,123 +1,100 @@
1
- import re
2
- import fitz # PyMuPDF
3
- import pandas as pd
4
- from collections import Counter
5
- from sklearn.cluster import KMeans
6
- from sklearn.metrics import silhouette_score
7
- from sentence_transformers import SentenceTransformer
8
- from nltk.corpus import stopwords
9
- import nltk
10
- import gradio as gr
11
-
12
- # ----------------------------
13
- # πŸ“¦ Setup
14
- # ----------------------------
15
- nltk.download('stopwords', quiet=True)
16
- STOPWORDS = set(stopwords.words('english'))
17
-
18
- # ----------------------------
19
- # πŸ“˜ PDF Text Extraction
20
- # ----------------------------
21
- def extract_text_from_pdf(pdf_file):
22
- """Extract text from uploaded PDF file"""
23
- text = ""
24
- with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
25
- for page in doc:
26
- text += page.get_text("text")
27
- return text.strip()
28
-
29
- # ----------------------------
30
- # 🧹 Text Cleaning
31
- # ----------------------------
32
- def clean_text(text):
33
- """Clean and remove stopwords"""
34
- text = re.sub(r"[^a-zA-Z ]", " ", text)
35
- words = [w.lower() for w in text.split() if w.lower() not in STOPWORDS and len(w) > 2]
36
- return words
37
-
38
- # ----------------------------
39
- # πŸ€– Topic Modeling Function
40
- # ----------------------------
41
- def transformer_topic_modeling(sentences, auto_topics=True, max_k=8, fixed_k=5):
42
- """Cluster sentences into topics using transformer embeddings"""
43
- model = SentenceTransformer('flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot')
44
- embeddings = model.encode(sentences, show_progress_bar=False)
45
-
46
- # --- Auto-select topic number ---
47
- if auto_topics:
48
- if len(sentences) < 3:
49
- num_topics = 1
50
- else:
51
- scores = []
52
- for k in range(2, min(max_k, len(sentences))):
53
- kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(embeddings)
54
- try:
55
- score = silhouette_score(embeddings, kmeans.labels_)
56
- scores.append((k, score))
57
- except:
58
- continue
59
- num_topics = max(scores, key=lambda x: x[1])[0] if scores else 2
60
- else:
61
- num_topics = fixed_k
62
-
63
- # --- Clustering ---
64
- kmeans = KMeans(n_clusters=num_topics, random_state=42, n_init=10)
65
- kmeans.fit(embeddings)
66
- df = pd.DataFrame({"Sentence": sentences, "Topic": kmeans.labels_})
67
-
68
- # --- Build topic summaries ---
69
- topic_data = []
70
- for topic_id in range(num_topics):
71
- topic_sentences = df[df["Topic"] == topic_id]["Sentence"].tolist()
72
- words = []
73
- for s in topic_sentences:
74
- words.extend(clean_text(s))
75
- word_freq = Counter(words)
76
- top_words = [w for w, _ in word_freq.most_common(3)]
77
- title = " & ".join(top_words).capitalize() if top_words else "Miscellaneous"
78
- examples = topic_sentences[:3]
79
- topic_data.append((f"Topic {topic_id + 1}: {title}", "\n".join(examples)))
80
-
81
- return topic_data, num_topics
82
-
83
- # ----------------------------
84
- # πŸš€ Gradio Interface Logic
85
- # ----------------------------
86
- def analyze_input(pdf_file, essay_text):
87
- pdf_text = ""
88
- if pdf_file:
89
- pdf_text = extract_text_from_pdf(pdf_file)
90
-
91
- full_text = (pdf_text + "\n" + (essay_text or "")).strip()
92
- if not full_text:
93
- return "❌ Please upload a PDF or write an essay."
94
-
95
- sentences = [s.strip() for s in re.split(r'[.!?]', full_text) if len(s.strip()) > 20]
96
- if len(sentences) < 2:
97
- return "⚠️ Not enough text for topic modeling."
98
-
99
- topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)
100
-
101
- # --- Display output ---
102
- output_text = f"βœ… **Detected {num_topics} Topics:**\n\n"
103
- for title, examples in topic_data:
104
- output_text += f"### {title}\n{examples}\n\n"
105
-
106
- return output_text
107
-
108
- # ----------------------------
109
- # 🎨 Gradio Interface
110
- # ----------------------------
111
- demo = gr.Interface(
112
- fn=analyze_input,
113
- inputs=[
114
- gr.File(label="πŸ“‚ Upload PDF (optional)"),
115
- gr.Textbox(lines=10, placeholder="✍️ Write or paste your essay here...", label="Essay Text")
116
- ],
117
- outputs=gr.Markdown(label="🧠 Detected Topics"),
118
- title="PDF + Essay Topic Discovery (Transformer-Based)",
119
- description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
120
- )
121
-
122
- if __name__ == "__main__":
123
- demo.launch()
 
1
+ import gradio as gr
2
+ import re
3
+ import fitz # PyMuPDF for PDF extraction
4
+ import pandas as pd
5
+ import numpy as np
6
+ from sklearn.cluster import KMeans
7
+ from sentence_transformers import SentenceTransformer
8
+
9
+
10
+ # ---------- Helper: extract text from PDF ----------
11
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at *pdf_path*.

    Uses PyMuPDF (``fitz``). Pages are concatenated in document order;
    an empty or image-only PDF yields "".
    """
    # Collect per-page text and join once — avoids quadratic `text +=`
    # accumulation on large documents.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
17
+
18
+
19
+ # ---------- Helper: Transformer Topic Modeling ----------
20
def transformer_topic_modeling(sentences, auto_topics=True, num_topics=5):
    """Cluster *sentences* into topics using sentence-transformer embeddings.

    Parameters
    ----------
    sentences : list[str]
        Sentences to cluster (expects at least 2).
    auto_topics : bool
        If True, choose the topic count via an elbow heuristic on KMeans
        inertia; otherwise use *num_topics*.
    num_topics : int
        Fixed topic count used when ``auto_topics`` is False.

    Returns
    -------
    (topics, num_topics)
        ``topics`` is a list of ``(title, example_sentences)`` tuples.
    """
    print("🔹 Using Transformer-based Embeddings...")
    model = SentenceTransformer("flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot")
    embeddings = model.encode(sentences)

    if auto_topics:
        # Elbow heuristic: pick the k just before the largest inertia drop.
        distortions = []
        K = range(2, min(10, len(sentences) // 2 + 2))
        for k in K:
            km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(embeddings)
            distortions.append(km.inertia_)
        diffs = np.diff(distortions)
        num_topics = K[np.argmin(diffs)] if len(diffs) > 0 else 3

    # Guard: KMeans requires n_clusters <= n_samples (the heuristic's
    # fallback of 3, or a caller-supplied num_topics, may exceed it).
    num_topics = max(1, min(num_topics, len(sentences)))

    kmeans = KMeans(n_clusters=num_topics, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)
    df = pd.DataFrame({"Sentence": sentences, "Topic": labels})

    # Words too short or too common to make a meaningful topic title.
    trivial = {
        "the", "and", "for", "that", "with", "this", "are", "was",
        "have", "has", "but", "not", "you", "all", "can", "from",
    }

    topics = []
    for i in range(num_topics):
        topic_sentences = df[df["Topic"] == i]["Sentence"].tolist()
        joined_text = " ".join(topic_sentences)
        words = [
            w for w in re.findall(r"\b\w+\b", joined_text.lower())
            if len(w) > 2 and w not in trivial
        ]
        top_words = pd.Series(words).value_counts().head(3).index.tolist()
        # An empty cluster (or one with no usable words) gets a fallback title.
        title = " & ".join(top_words).title() if top_words else "Miscellaneous"
        topics.append((title, " ".join(topic_sentences[:3])))

    return topics, num_topics
50
+
51
+
52
+ # ---------- Main Function ----------
53
# ---------- Main Function ----------
def analyze_input(pdf_file, essay_text):
    """Gradio handler: merge PDF text with the typed essay, run topic
    modeling, and return a Markdown summary string (or a user-facing
    error message)."""
    try:
        extracted = extract_text_from_pdf(pdf_file.name) if pdf_file else ""
        if pdf_file:
            print("✅ PDF extracted successfully, length:", len(extracted))

        combined = (extracted + "\n" + (essay_text or "")).strip()
        if not combined:
            return "❌ Please upload a PDF or write an essay."

        # Keep only substantial sentence fragments (> 20 chars after strip).
        fragments = re.split(r'[.!?]', combined)
        sentences = [frag.strip() for frag in fragments if len(frag.strip()) > 20]
        print("🧾 Sentence count:", len(sentences))

        if len(sentences) < 2:
            return "⚠️ Not enough text for topic modeling."

        topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)
        print("✅ Topics discovered:", num_topics)

        # Assemble Markdown output (Gradio-safe: a plain string).
        parts = [f"✅ **Detected {num_topics} Topics:**\n"]
        parts.extend(
            f"**Topic {idx}: {title}**\n{examples}\n"
            for idx, (title, examples) in enumerate(topic_data, 1)
        )
        return "\n\n".join(parts)

    except Exception as exc:
        import traceback
        print(traceback.format_exc())  # full error log for Hugging Face
        return f"⚠️ Error: {str(exc)}"
85
+
86
+
87
+ # ---------- Gradio UI ----------
88
# ---------- Gradio UI ----------
# Single-function interface: two inputs (an optional PDF upload and a
# free-text essay box) feed analyze_input; the result renders in one
# Markdown output pane.
demo = gr.Interface(
    fn=analyze_input,
    inputs=[
        gr.File(label="📂 Upload a PDF (optional)"),
        gr.Textbox(label="📝 Essay Text", lines=7, placeholder="Write or paste your essay here...")
    ],
    outputs=gr.Markdown(label="🧠 Topic Analysis Result"),
    title="Topic Modeling App",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
)

# Script entry point (how Hugging Face Spaces starts the app).
if __name__ == "__main__":
    demo.launch()