anujjuna committed on
Commit
7adf08f
·
verified ·
1 Parent(s): 8d9f7e0

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +26 -3
tools.py CHANGED
@@ -114,14 +114,37 @@ def _split_sentences(text):
114
  return list(filter(lambda s: len(s) > 20, cleaned))
115
 
116
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  # ── Tool 1: load_scopus_csv ────────────────────────────────────────────────────
118
  @tool
119
  def load_scopus_csv(filepath: str) -> str:
120
  """Load a Scopus CSV export, count papers and sentences, apply boilerplate filtering.
121
  Returns stats string with paper count, abstract sentence count, title sentence count.
122
  filepath: path to the uploaded CSV file."""
123
- df = pd.read_csv(filepath, encoding="utf-8-8-sig")
124
- df.to_csv(CSV_PATH, index=False)
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  paper_count = len(df)
127
  abstract_sentences = list(
@@ -158,7 +181,7 @@ def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
158
  """Embed sentences with all-MiniLM-L6-v2, cluster with AgglomerativeClustering (cosine metric),
159
  find 5 nearest centroids per cluster, generate 4 Plotly charts. Save summaries.json + emb.npy.
160
  run_key: 'abstract' or 'title'. threshold: clustering distance threshold (default 0.7)."""
161
- df = pd.read_csv(CSV_PATH, encoding="utf-8-8-sig")
162
  columns = RUN_CONFIGS[run_key]
163
 
164
  texts = sum(
 
114
  return list(filter(lambda s: len(s) > 20, cleaned))
115
 
116
 
117
+ # ── Encoding helper ───────────────────────────────────────────────────────────
118
def _try_read_csv(filepath, enc):
    """Read *filepath* as CSV decoded with *enc*; return None if that fails.

    Rows pandas cannot parse are dropped (``on_bad_lines="skip"``).

    filepath: path of the CSV file to read.
    enc: codec name handed to ``pd.read_csv(encoding=...)``.

    Returns the parsed DataFrame, or None when the read raises for any
    reason (undecodable bytes, unknown codec name, unreadable file,
    malformed CSV), so a caller can simply try the next encoding.
    """
    import logging

    try:
        return pd.read_csv(filepath, encoding=enc, on_bad_lines="skip")
    except Exception as exc:
        # Deliberately broad: this helper is a best-effort probe and must
        # never raise. The cause is logged instead of silently discarded so
        # a missing file is distinguishable from a wrong encoding.
        logging.getLogger(__name__).debug(
            "read_csv(%r, encoding=%r) failed: %s", filepath, enc, exc
        )
        return None
127
+
128
+
129
  # ── Tool 1: load_scopus_csv ────────────────────────────────────────────────────
130
  @tool
131
  def load_scopus_csv(filepath: str) -> str:
132
  """Load a Scopus CSV export, count papers and sentences, apply boilerplate filtering.
133
  Returns stats string with paper count, abstract sentence count, title sentence count.
134
  filepath: path to the uploaded CSV file."""
135
+ # Auto-detect encoding: covers utf-8-sig (BOM), plain utf-8, latin-1, windows-1252
136
+ encodings = ["utf-8-sig", "utf-8", "latin-1", "cp1252", "iso-8859-1"]
137
+ df = None
138
+ detected_enc = None
139
+ for enc in encodings:
140
+ candidate = _try_read_csv(filepath, enc)
141
+ if candidate is not None and len(candidate) > 0:
142
+ df = candidate
143
+ detected_enc = enc
144
+ break
145
+ if df is None:
146
+ return "❌ Could not read CSV with any supported encoding. Please re-save as UTF-8 and re-upload."
147
+ df.to_csv(CSV_PATH, index=False, encoding="utf-8")
148
 
149
  paper_count = len(df)
150
  abstract_sentences = list(
 
181
  """Embed sentences with all-MiniLM-L6-v2, cluster with AgglomerativeClustering (cosine metric),
182
  find 5 nearest centroids per cluster, generate 4 Plotly charts. Save summaries.json + emb.npy.
183
  run_key: 'abstract' or 'title'. threshold: clustering distance threshold (default 0.7)."""
184
+ df = pd.read_csv(CSV_PATH, encoding="utf-8")
185
  columns = RUN_CONFIGS[run_key]
186
 
187
  texts = sum(