anujjuna committed on
Commit
dc452bc
·
verified ·
1 Parent(s): 079d3be

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +17 -8
tools.py CHANGED
@@ -150,9 +150,11 @@ def load_scopus_csv(filepath: str) -> str:
150
  abstract_sentences = list(
151
  filter(None, sum(map(_split_sentences, df["Abstract"].dropna().tolist()), []))
152
  )
153
- title_sentences = list(
154
- filter(None, sum(map(_split_sentences, df["Title"].dropna().tolist()), []))
155
- )
 
 
156
 
157
  stats = {
158
  "papers": paper_count,
@@ -168,7 +170,7 @@ def load_scopus_csv(filepath: str) -> str:
168
  f"βœ… CSV loaded successfully.\n"
169
  f"πŸ“„ Papers: {paper_count}\n"
170
  f"πŸ“ Abstract sentences (after cleaning): {len(abstract_sentences)}\n"
171
- f"πŸ”€ Title sentences (after cleaning): {len(title_sentences)}\n"
172
  f"πŸ“… Year range: {stats['year_range']}\n"
173
  f"πŸ“Š Columns: {', '.join(stats['columns'])}\n\n"
174
  f"Data is ready. Please type **'run abstract'** to begin Phase 2 BERTopic analysis on abstracts."
@@ -187,10 +189,17 @@ def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
187
  texts = sum(
188
  list(map(lambda col: df[col].dropna().tolist(), columns)), []
189
  )
190
- sentences = list(
191
- filter(lambda s: len(s) > 20,
192
- sum(list(map(_split_sentences, texts)), []))
193
- )
 
 
 
 
 
 
 
194
 
195
  model = SentenceTransformer("all-MiniLM-L6-v2")
196
  embeddings = model.encode(sentences, normalize_embeddings=True, show_progress_bar=False)
 
150
  abstract_sentences = list(
151
  filter(None, sum(map(_split_sentences, df["Abstract"].dropna().tolist()), []))
152
  )
153
+ # Titles are atomic units — count each non-empty title as one unit (no sent_tokenize)
154
+ title_sentences = list(filter(
155
+ lambda s: len(s.strip()) >= 5,
156
+ list(map(lambda t: _clean_sentence(str(t)), df["Title"].dropna().tolist()))
157
+ ))
158
 
159
  stats = {
160
  "papers": paper_count,
 
170
  f"βœ… CSV loaded successfully.\n"
171
  f"πŸ“„ Papers: {paper_count}\n"
172
  f"πŸ“ Abstract sentences (after cleaning): {len(abstract_sentences)}\n"
173
+ f"πŸ”€ Title records (after cleaning): {len(title_sentences)}\n"
174
  f"πŸ“… Year range: {stats['year_range']}\n"
175
  f"πŸ“Š Columns: {', '.join(stats['columns'])}\n\n"
176
  f"Data is ready. Please type **'run abstract'** to begin Phase 2 BERTopic analysis on abstracts."
 
189
  texts = sum(
190
  list(map(lambda col: df[col].dropna().tolist(), columns)), []
191
  )
192
+
193
+ # Titles are already single semantic units — do NOT split into sentences.
194
+ # Abstracts get split into sentences for finer-grained clustering.
195
+ # Min-length: at least 5 chars (after stripping) for titles, more than 20 chars for abstract sentences.
196
+ sentences = list(filter(
197
+ lambda s: len(s.strip()) >= 5,
198
+ list(map(lambda t: _clean_sentence(str(t)), texts))
199
+ )) if run_key == "title" else list(filter(
200
+ lambda s: len(s) > 20,
201
+ sum(list(map(_split_sentences, texts)), [])
202
+ ))
203
 
204
  model = SentenceTransformer("all-MiniLM-L6-v2")
205
  embeddings = model.encode(sentences, normalize_embeddings=True, show_progress_bar=False)