Mooo-osama03 committed on
Commit
bd7a3b4
·
verified ·
1 Parent(s): 6d5ccf0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -6
app.py CHANGED
@@ -5,7 +5,12 @@ import pandas as pd
5
  import numpy as np
6
  from sklearn.cluster import KMeans
7
  from sentence_transformers import SentenceTransformer
 
 
8
 
 
 
 
9
 
10
  # ---------- Helper: extract text from PDF ----------
11
  def extract_text_from_pdf(pdf_path):
@@ -41,8 +46,15 @@ def transformer_topic_modeling(sentences, auto_topics=True, num_topics=5):
41
  for i in range(num_topics):
42
  topic_sentences = df[df["Topic"] == i]["Sentence"].tolist()
43
  joined_text = " ".join(topic_sentences)
44
- words = re.findall(r"\b\w+\b", joined_text.lower())
45
- top_words = pd.Series(words).value_counts().head(3).index.tolist()
 
 
 
 
 
 
 
46
  title = " & ".join(top_words).title()
47
  topics.append((title, " ".join(topic_sentences[:3])))
48
 
@@ -70,17 +82,17 @@ def analyze_input(pdf_file, essay_text):
70
  topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)
71
  print("✅ Topics discovered:", num_topics)
72
 
73
- # Build Markdown output (Gradio-safe)
74
  output_lines = [f"✅ **Detected {num_topics} Topics:**\n"]
75
  for i, (title, examples) in enumerate(topic_data, 1):
76
  output_lines.append(f"**Topic {i}: {title}**\n{examples}\n")
77
  result = "\n\n".join(output_lines)
78
 
79
- return result # ✅ Must return a string
80
 
81
  except Exception as e:
82
  import traceback
83
- print(traceback.format_exc()) # full error log for Hugging Face
84
  return f"⚠️ Error: {str(e)}"
85
 
86
 
@@ -92,7 +104,7 @@ demo = gr.Interface(
92
  gr.Textbox(label="📝 Essay Text", lines=7, placeholder="Write or paste your essay here...")
93
  ],
94
  outputs=gr.Markdown(label="🧠 Topic Analysis Result"),
95
- title="Topic Modeling App",
96
  description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
97
  )
98
 
 
5
  import numpy as np
6
  from sklearn.cluster import KMeans
7
  from sentence_transformers import SentenceTransformer
8
+ import nltk
9
+ from nltk.corpus import stopwords
10
 
11
+ # ---------- Setup ----------
12
+ nltk.download('stopwords', quiet=True)
13
+ stop_words = set(stopwords.words('english'))
14
 
15
  # ---------- Helper: extract text from PDF ----------
16
  def extract_text_from_pdf(pdf_path):
 
46
  for i in range(num_topics):
47
  topic_sentences = df[df["Topic"] == i]["Sentence"].tolist()
48
  joined_text = " ".join(topic_sentences)
49
+
50
+ # --- Extract keywords excluding stopwords ---
51
+ words = re.findall(r"\b[a-z]{3,}\b", joined_text.lower())
52
+ filtered = [w for w in words if w not in stop_words]
53
+ if filtered:
54
+ top_words = pd.Series(filtered).value_counts().head(3).index.tolist()
55
+ else:
56
+ top_words = ["General"]
57
+
58
  title = " & ".join(top_words).title()
59
  topics.append((title, " ".join(topic_sentences[:3])))
60
 
 
82
  topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)
83
  print("✅ Topics discovered:", num_topics)
84
 
85
+ # Build Markdown output for Gradio
86
  output_lines = [f"✅ **Detected {num_topics} Topics:**\n"]
87
  for i, (title, examples) in enumerate(topic_data, 1):
88
  output_lines.append(f"**Topic {i}: {title}**\n{examples}\n")
89
  result = "\n\n".join(output_lines)
90
 
91
+ return result # ✅ Return string only
92
 
93
  except Exception as e:
94
  import traceback
95
+ print(traceback.format_exc()) # full log in Hugging Face console
96
  return f"⚠️ Error: {str(e)}"
97
 
98
 
 
104
  gr.Textbox(label="📝 Essay Text", lines=7, placeholder="Write or paste your essay here...")
105
  ],
106
  outputs=gr.Markdown(label="🧠 Topic Analysis Result"),
107
+ title="Topic Modeling App (PDF + Essay)",
108
  description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
109
  )
110