Spaces:
Sleeping
Sleeping
Update tools.py
Browse files
tools.py
CHANGED
|
@@ -114,14 +114,37 @@ def _split_sentences(text):
|
|
| 114 |
return list(filter(lambda s: len(s) > 20, cleaned))
|
| 115 |
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
# ── Tool 1: load_scopus_csv ───────────────────────────────────────────────────
|
| 118 |
@tool
|
| 119 |
def load_scopus_csv(filepath: str) -> str:
|
| 120 |
"""Load a Scopus CSV export, count papers and sentences, apply boilerplate filtering.
|
| 121 |
Returns stats string with paper count, abstract sentence count, title sentence count.
|
| 122 |
filepath: path to the uploaded CSV file."""
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
paper_count = len(df)
|
| 127 |
abstract_sentences = list(
|
|
@@ -158,7 +181,7 @@ def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
|
|
| 158 |
"""Embed sentences with all-MiniLM-L6-v2, cluster with AgglomerativeClustering (cosine metric),
|
| 159 |
find 5 nearest centroids per cluster, generate 4 Plotly charts. Save summaries.json + emb.npy.
|
| 160 |
run_key: 'abstract' or 'title'. threshold: clustering distance threshold (default 0.7)."""
|
| 161 |
-
df = pd.read_csv(CSV_PATH, encoding="utf-8
|
| 162 |
columns = RUN_CONFIGS[run_key]
|
| 163 |
|
| 164 |
texts = sum(
|
|
|
|
| 114 |
return list(filter(lambda s: len(s) > 20, cleaned))
|
| 115 |
|
| 116 |
|
| 117 |
+
# ── Encoding helper ───────────────────────────────────────────────────────────
|
| 118 |
+
def _try_read_csv(filepath, enc):
    """Try to parse a CSV file with one candidate text encoding.

    Acts as a probe during encoding auto-detection: any failure is reported
    as ``None`` instead of being raised, so the caller can simply move on to
    the next candidate encoding.

    Args:
        filepath: Path of the CSV file, passed straight to ``pd.read_csv``.
        enc: Codec name passed to ``pd.read_csv`` as ``encoding``.

    Returns:
        The parsed ``DataFrame`` (malformed rows are skipped through
        ``on_bad_lines="skip"``), or ``None`` if reading raised any exception.
    """
    import logging

    try:
        return pd.read_csv(filepath, encoding=enc, on_bad_lines="skip")
    except Exception as exc:
        # Deliberately broad: a wrong or unknown codec, an unreadable file and
        # a parser failure must all mean "this encoding did not work", exactly
        # as before. The failure is logged so it is no longer invisible.
        logging.getLogger(__name__).debug(
            "read_csv failed with encoding %r: %s", enc, exc
        )
        return None
|
| 127 |
+
|
| 128 |
+
|
| 129 |
# ── Tool 1: load_scopus_csv ───────────────────────────────────────────────────
|
| 130 |
@tool
|
| 131 |
def load_scopus_csv(filepath: str) -> str:
|
| 132 |
"""Load a Scopus CSV export, count papers and sentences, apply boilerplate filtering.
|
| 133 |
Returns stats string with paper count, abstract sentence count, title sentence count.
|
| 134 |
filepath: path to the uploaded CSV file."""
|
| 135 |
+
# Auto-detect encoding: covers utf-8-sig (BOM), plain utf-8, latin-1, windows-1252
|
| 136 |
+
encodings = ["utf-8-sig", "utf-8", "latin-1", "cp1252", "iso-8859-1"]
|
| 137 |
+
df = None
|
| 138 |
+
detected_enc = None
|
| 139 |
+
for enc in encodings:
|
| 140 |
+
candidate = _try_read_csv(filepath, enc)
|
| 141 |
+
if candidate is not None and len(candidate) > 0:
|
| 142 |
+
df = candidate
|
| 143 |
+
detected_enc = enc
|
| 144 |
+
break
|
| 145 |
+
if df is None:
|
| 146 |
+
return "β Could not read CSV with any supported encoding. Please re-save as UTF-8 and re-upload."
|
| 147 |
+
df.to_csv(CSV_PATH, index=False, encoding="utf-8")
|
| 148 |
|
| 149 |
paper_count = len(df)
|
| 150 |
abstract_sentences = list(
|
|
|
|
| 181 |
"""Embed sentences with all-MiniLM-L6-v2, cluster with AgglomerativeClustering (cosine metric),
|
| 182 |
find 5 nearest centroids per cluster, generate 4 Plotly charts. Save summaries.json + emb.npy.
|
| 183 |
run_key: 'abstract' or 'title'. threshold: clustering distance threshold (default 0.7)."""
|
| 184 |
+
df = pd.read_csv(CSV_PATH, encoding="utf-8")
|
| 185 |
columns = RUN_CONFIGS[run_key]
|
| 186 |
|
| 187 |
texts = sum(
|