Spaces:

anujjuna
/

BERTopic_Agentic_AI

Sleeping

App Files Files Community

anujjuna committed on Apr 14

Commit

7adf08f

verified ·

1 Parent(s): 8d9f7e0

Update tools.py

Browse files

Files changed (1) hide show

tools.py +26 -3

tools.py CHANGED Viewed

@@ -114,14 +114,37 @@ def _split_sentences(text):
     return list(filter(lambda s: len(s) > 20, cleaned))
 # ── Tool 1: load_scopus_csv ────────────────────────────────────────────────────
 @tool
 def load_scopus_csv(filepath: str) -> str:
     """Load a Scopus CSV export, count papers and sentences, apply boilerplate filtering.
     Returns stats string with paper count, abstract sentence count, title sentence count.
     filepath: path to the uploaded CSV file."""
-    df = pd.read_csv(filepath, encoding="utf-8-8-sig")
-    df.to_csv(CSV_PATH, index=False)
     paper_count = len(df)
     abstract_sentences = list(
@@ -158,7 +181,7 @@ def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
     """Embed sentences with all-MiniLM-L6-v2, cluster with AgglomerativeClustering (cosine metric),
     find 5 nearest centroids per cluster, generate 4 Plotly charts. Save summaries.json + emb.npy.
     run_key: 'abstract' or 'title'. threshold: clustering distance threshold (default 0.7)."""
-    df = pd.read_csv(CSV_PATH, encoding="utf-8-8-sig")
     columns = RUN_CONFIGS[run_key]
     texts = sum(

     return list(filter(lambda s: len(s) > 20, cleaned))
+# ── Encoding helper ───────────────────────────────────────────────────────────
+def _try_read_csv(filepath, enc):
+    """Return DataFrame if encoding works, else None."""
+    result = [None]
+    def _read():
+        result[0] = pd.read_csv(filepath, encoding=enc, on_bad_lines="skip")
+    import contextlib, io
+    with contextlib.suppress(Exception):
+        _read()
+    return result[0]
 # ── Tool 1: load_scopus_csv ────────────────────────────────────────────────────
 @tool
 def load_scopus_csv(filepath: str) -> str:
     """Load a Scopus CSV export, count papers and sentences, apply boilerplate filtering.
     Returns stats string with paper count, abstract sentence count, title sentence count.
     filepath: path to the uploaded CSV file."""
+    # Auto-detect encoding: covers utf-8-sig (BOM), plain utf-8, latin-1, windows-1252
+    encodings = ["utf-8-sig", "utf-8", "latin-1", "cp1252", "iso-8859-1"]
+    df = None
+    detected_enc = None
+    for enc in encodings:
+        candidate = _try_read_csv(filepath, enc)
+        if candidate is not None and len(candidate) > 0:
+            df = candidate
+            detected_enc = enc
+            break
+    if df is None:
+        return "❌ Could not read CSV with any supported encoding. Please re-save as UTF-8 and re-upload."
+    df.to_csv(CSV_PATH, index=False, encoding="utf-8")
     paper_count = len(df)
     abstract_sentences = list(
     """Embed sentences with all-MiniLM-L6-v2, cluster with AgglomerativeClustering (cosine metric),
     find 5 nearest centroids per cluster, generate 4 Plotly charts. Save summaries.json + emb.npy.
     run_key: 'abstract' or 'title'. threshold: clustering distance threshold (default 0.7)."""
+    df = pd.read_csv(CSV_PATH, encoding="utf-8")
     columns = RUN_CONFIGS[run_key]
     texts = sum(