Spaces:
Running
Running
Update tools.py
Browse files
tools.py
CHANGED
|
@@ -150,9 +150,11 @@ def load_scopus_csv(filepath: str) -> str:
|
|
| 150 |
abstract_sentences = list(
|
| 151 |
filter(None, sum(map(_split_sentences, df["Abstract"].dropna().tolist()), []))
|
| 152 |
)
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
| 156 |
|
| 157 |
stats = {
|
| 158 |
"papers": paper_count,
|
|
@@ -168,7 +170,7 @@ def load_scopus_csv(filepath: str) -> str:
|
|
| 168 |
f"✅ CSV loaded successfully.\n"
|
| 169 |
f"π Papers: {paper_count}\n"
|
| 170 |
f"π Abstract sentences (after cleaning): {len(abstract_sentences)}\n"
|
| 171 |
-
f"π€ Title
|
| 172 |
f"📅 Year range: {stats['year_range']}\n"
|
| 173 |
f"π Columns: {', '.join(stats['columns'])}\n\n"
|
| 174 |
f"Data is ready. Please type **'run abstract'** to begin Phase 2 BERTopic analysis on abstracts."
|
|
@@ -187,10 +189,17 @@ def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
|
|
| 187 |
texts = sum(
|
| 188 |
list(map(lambda col: df[col].dropna().tolist(), columns)), []
|
| 189 |
)
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 196 |
embeddings = model.encode(sentences, normalize_embeddings=True, show_progress_bar=False)
|
|
|
|
| 150 |
abstract_sentences = list(
|
| 151 |
filter(None, sum(map(_split_sentences, df["Abstract"].dropna().tolist()), []))
|
| 152 |
)
|
| 153 |
+
# Titles are atomic units — count each non-empty title as one unit (no sent_tokenize)
|
| 154 |
+
title_sentences = list(filter(
|
| 155 |
+
lambda s: len(s.strip()) >= 5,
|
| 156 |
+
list(map(lambda t: _clean_sentence(str(t)), df["Title"].dropna().tolist()))
|
| 157 |
+
))
|
| 158 |
|
| 159 |
stats = {
|
| 160 |
"papers": paper_count,
|
|
|
|
| 170 |
f"✅ CSV loaded successfully.\n"
|
| 171 |
f"π Papers: {paper_count}\n"
|
| 172 |
f"π Abstract sentences (after cleaning): {len(abstract_sentences)}\n"
|
| 173 |
+
f"🔤 Title records (after cleaning): {len(title_sentences)}\n"
|
| 174 |
f"📅 Year range: {stats['year_range']}\n"
|
| 175 |
f"π Columns: {', '.join(stats['columns'])}\n\n"
|
| 176 |
f"Data is ready. Please type **'run abstract'** to begin Phase 2 BERTopic analysis on abstracts."
|
|
|
|
| 189 |
texts = sum(
|
| 190 |
list(map(lambda col: df[col].dropna().tolist(), columns)), []
|
| 191 |
)
|
| 192 |
+
|
| 193 |
+
# Titles are already single semantic units — do NOT split into sentences.
|
| 194 |
+
# Abstracts get split into sentences for finer-grained clustering.
|
| 195 |
+
# Min-length: 5 chars for titles, 20 chars for abstract sentences.
|
| 196 |
+
sentences = list(filter(
|
| 197 |
+
lambda s: len(s.strip()) >= 5,
|
| 198 |
+
list(map(lambda t: _clean_sentence(str(t)), texts))
|
| 199 |
+
)) if run_key == "title" else list(filter(
|
| 200 |
+
lambda s: len(s) > 20,
|
| 201 |
+
sum(list(map(_split_sentences, texts)), [])
|
| 202 |
+
))
|
| 203 |
|
| 204 |
model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 205 |
embeddings = model.encode(sentences, normalize_embeddings=True, show_progress_bar=False)
|