abhi1294 committed on
Commit
5b48ced
·
1 Parent(s): 64f6aad

Fix prompts and utils

Browse files
Files changed (1) hide show
  1. tools.py +33 -268
tools.py CHANGED
@@ -1,224 +1,38 @@
1
- # from __future__ import annotations
2
- # import io
3
- # import json
4
- # import os
5
- # from pathlib import Path
6
- # from typing import Optional
7
- # import pandas as pd
8
- # import requests
9
-
10
- # class TaskFileTool:
11
- # """
12
- # Downloads and reads task-linked files from the Hugging Face
13
- # Unit 4 scoring API.
14
-
15
- # Supported text extraction:
16
- # - txt
17
- # - csv
18
- # - json
19
- # - md
20
- # - html
21
- # - xml
22
-
23
- # For unsupported or binary files, it safely returns an empty string for now.
24
- # We can extend this later for PDF/images if needed.
25
- # """
26
-
27
- # def __init__(self, api_base_url: str, cache_dir:str = "task_files", timeout: int =30):
28
- # self.api_base_url = api_base_url.rstrip("/")
29
- # self.cache_dir = Path(cache_dir)
30
- # self.cache_dir.mkdir(parents=True, exist_ok=True)
31
- # self.timeout = timeout
32
-
33
- # def get_task_context(self, task_id: str) -> str:
34
- # """
35
- # Main entry point used by the agent:
36
- # 1. download the task file if present
37
- # 2. read it into text context if supported
38
- # """
39
- # file_path = self.download_task_file(task_id)
40
- # if file_path is None:
41
- # return ""
42
- # return self.read_file_as_text(file_path)
43
-
44
- # def download_task_file(self, task_id: str) -> Optional[Path]:
45
- # """
46
- # Downloads the file linked to a task_id using:
47
- # GET /files/{task_id}
48
-
49
- # Returns:
50
- # Path to saved file if successful, else None
51
- # """
52
- # url = f"{self.api_base_url}/files/{task_id}"
53
-
54
- # try:
55
- # response = requests.get(url, timeout=self.timeout)
56
- # except requests.RequestException:
57
- # return None
58
-
59
- # if response.status_code !=200:
60
- # return None
61
-
62
- # filename = self._infer_filename(response=response, task_id=task_id)
63
- # file_path = self.cache_dir / filename
64
-
65
- # try:
66
- # with open(file_path, "wb") as f:
67
- # f.write(response.content)
68
- # return file_path
69
- # except OSError:
70
- # return None
71
- # return file_path
72
-
73
- # def read_file_as_text(self, file_path: Path) -> str:
74
- # """
75
- # Reads supported file types into plain text.
76
- # """
77
- # suffix = file_path.suffix.lower()
78
-
79
- # try:
80
- # if suffix in {".txt", ".md", ".html", ".xml", ".csv", ".json"}:
81
- # return self._read_supported_text_file(file_path, suffix)
82
-
83
- # # common fallback for files saved without extension but actually text
84
- # if suffix == "":
85
- # return self._read_extensionless_file(file_path)
86
-
87
- # return ""
88
- # except Exception:
89
- # return ""
90
-
91
- # def _read_supported_text_file(self, file_path: Path, suffix: str) -> str:
92
- # if suffix in {".txt", ".md", ".html", ".xml"}:
93
- # return file_path.read_text(encoding="utf-8", errors="ignore")
94
-
95
- # if suffix == ".json":
96
- # raw = file_path.read_text(encoding="utf-8", errors="ignore")
97
- # try:
98
- # parsed = json.loads(raw)
99
- # return json.dumps(parsed, indent=2, ensure_ascii=False)
100
- # except json.JSONDecodeError:
101
- # return raw
102
-
103
- # if suffix == ".csv":
104
- # try:
105
- # df = pd.read_csv(file_path)
106
- # return df.to_csv(index=False)
107
- # except Exception:
108
- # return file_path.read_text(encoding="utf-8", errors="ignore")
109
-
110
- # return ""
111
-
112
- # def _read_extensionless_file(self, file_path: Path) -> str:
113
- # """
114
- # Try to interpret extensionless files as utf-8 text first.
115
- # """
116
- # try:
117
- # raw = file_path.read_text(encoding="utf-8", errors="ignore")
118
- # if raw.strip():
119
- # return raw
120
- # except Exception:
121
- # pass
122
- # return ""
123
-
124
- # def _infer_filename(self, response: requests.Response, task_id: str) -> str:
125
- # """
126
- # Attempts to infer a useful filename from headers.
127
- # Falls back to task_id if no filename is available.
128
- # """
129
- # content_disposition = response.headers.get("content-disposition", "")
130
- # filename = self._extract_filename_from_content_disposition(content_disposition)
131
-
132
- # if filename:
133
- # return self._safe_filename(filename)
134
-
135
- # content_type = response.headers.get("content-type", "").lower()
136
- # extension = self._extension_from_content_type(content_type)
137
-
138
- # if extension:
139
- # return f"{task_id}{extension}"
140
-
141
- # return str(task_id)
142
-
143
- # @staticmethod
144
- # def _extract_filename_from_content_disposition(content_disposition: str) -> Optional[str]:
145
- # """
146
- # Example header:
147
- # content-disposition: attachment; filename="example.csv"
148
- # """
149
- # if "filename=" not in content_disposition:
150
- # return None
151
-
152
- # try:
153
- # filename = content_disposition.split("filename=")[-1].strip().strip('"')
154
- # return filename or None
155
- # except Exception:
156
- # return None
157
-
158
- # @staticmethod
159
- # def _extension_from_content_type(content_type: str) -> str:
160
- # mapping = {
161
- # "text/plain": ".txt",
162
- # "text/csv": ".csv",
163
- # "application/csv": ".csv",
164
- # "application/json": ".json",
165
- # "text/markdown": ".md",
166
- # "text/html": ".html",
167
- # "application/xml": ".xml",
168
- # "text/xml": ".xml",
169
- # }
170
-
171
- # for key, ext in mapping.items():
172
- # if key in content_type:
173
- # return ext
174
-
175
- # return ""
176
-
177
- # @staticmethod
178
- # def _safe_filename(filename: str) -> str:
179
- # """
180
- # Prevent path traversal and weird path issues.
181
- # """
182
- # return os.path.basename(filename)
183
-
184
  from __future__ import annotations
185
-
186
  import json
187
  import os
188
  from pathlib import Path
189
  from typing import Optional
190
-
191
  import pandas as pd
192
  import requests
193
 
194
-
195
  class TaskFileTool:
196
  """
197
- Downloads and reads task-linked files from the Hugging Face Unit 4 scoring API.
 
198
 
199
- Supported extraction:
200
  - txt
 
 
201
  - md
202
  - html
203
  - xml
204
- - csv
205
- - tsv
206
- - json
207
- - xlsx
208
- - xls
209
 
210
- For unsupported or binary files, it safely returns an empty string.
 
211
  """
212
 
213
- def __init__(self, api_base_url: str, cache_dir: str = "task_files", timeout: int = 30):
214
  self.api_base_url = api_base_url.rstrip("/")
215
  self.cache_dir = Path(cache_dir)
216
  self.cache_dir.mkdir(parents=True, exist_ok=True)
217
  self.timeout = timeout
218
-
219
  def get_task_context(self, task_id: str) -> str:
220
  """
221
- Main entry point:
222
  1. download the task file if present
223
  2. read it into text context if supported
224
  """
@@ -231,19 +45,20 @@ class TaskFileTool:
231
  """
232
  Downloads the file linked to a task_id using:
233
  GET /files/{task_id}
 
 
 
234
  """
235
- url = f"{self.api_base_url}/files/{task_id}"
236
 
237
  try:
238
  response = requests.get(url, timeout=self.timeout)
239
- except requests.RequestException as e:
240
- print(f"Task file download error for {task_id}: {e}")
241
  return None
242
-
243
- if response.status_code != 200:
244
- print(f"No task file for {task_id}. Status code: {response.status_code}")
245
  return None
246
-
247
  filename = self._infer_filename(response=response, task_id=task_id)
248
  file_path = self.cache_dir / filename
249
 
@@ -251,10 +66,10 @@ class TaskFileTool:
251
  with open(file_path, "wb") as f:
252
  f.write(response.content)
253
  return file_path
254
- except OSError as e:
255
- print(f"Failed to save task file for {task_id}: {e}")
256
  return None
257
-
 
258
  def read_file_as_text(self, file_path: Path) -> str:
259
  """
260
  Reads supported file types into plain text.
@@ -262,19 +77,15 @@ class TaskFileTool:
262
  suffix = file_path.suffix.lower()
263
 
264
  try:
265
- if suffix in {".txt", ".md", ".html", ".xml", ".csv", ".tsv", ".json"}:
266
  return self._read_supported_text_file(file_path, suffix)
267
 
268
- if suffix in {".xlsx", ".xls"}:
269
- return self._read_excel_file(file_path)
270
-
271
  if suffix == "":
272
  return self._read_extensionless_file(file_path)
273
 
274
- print(f"Unsupported file type: {suffix} for {file_path.name}")
275
  return ""
276
- except Exception as e:
277
- print(f"Failed to read file {file_path.name}: {e}")
278
  return ""
279
 
280
  def _read_supported_text_file(self, file_path: Path, suffix: str) -> str:
@@ -292,62 +103,12 @@ class TaskFileTool:
292
  if suffix == ".csv":
293
  try:
294
  df = pd.read_csv(file_path)
295
- return self._dataframe_to_context(df, file_path.name)
296
- except Exception:
297
- return file_path.read_text(encoding="utf-8", errors="ignore")
298
-
299
- if suffix == ".tsv":
300
- try:
301
- df = pd.read_csv(file_path, sep="\t")
302
- return self._dataframe_to_context(df, file_path.name)
303
  except Exception:
304
  return file_path.read_text(encoding="utf-8", errors="ignore")
305
 
306
  return ""
307
 
308
- def _read_excel_file(self, file_path: Path) -> str:
309
- """
310
- Reads Excel files and formats all sheets into a compact text context.
311
- """
312
- try:
313
- excel_file = pd.ExcelFile(file_path)
314
- sheet_texts = []
315
-
316
- for sheet_name in excel_file.sheet_names:
317
- try:
318
- df = pd.read_excel(file_path, sheet_name=sheet_name)
319
- rendered = self._dataframe_to_context(df, f"{file_path.name}::{sheet_name}")
320
- sheet_texts.append(f"Sheet: {sheet_name}\n{rendered}")
321
- except Exception as e:
322
- sheet_texts.append(f"Sheet: {sheet_name}\n[Unreadable sheet: {e}]")
323
-
324
- return "\n\n".join(sheet_texts).strip()
325
- except Exception as e:
326
- print(f"Excel read error for {file_path.name}: {e}")
327
- return ""
328
-
329
- def _dataframe_to_context(self, df: pd.DataFrame, label: str) -> str:
330
- """
331
- Convert a dataframe into a more LLM-friendly text block.
332
- """
333
- if df is None or df.empty:
334
- return f"File: {label}\n[Empty table]"
335
-
336
- preview_rows = min(len(df), 50)
337
- preview_df = df.head(preview_rows).copy()
338
-
339
- # Normalize NaN for readability
340
- preview_df = preview_df.fillna("")
341
-
342
- columns = ", ".join(str(c) for c in preview_df.columns.tolist())
343
-
344
- return (
345
- f"File: {label}\n"
346
- f"Columns: {columns}\n"
347
- f"Row count preview: {preview_rows}\n"
348
- f"Data preview:\n{preview_df.to_csv(index=False)}"
349
- )
350
-
351
  def _read_extensionless_file(self, file_path: Path) -> str:
352
  """
353
  Try to interpret extensionless files as utf-8 text first.
@@ -381,6 +142,10 @@ class TaskFileTool:
381
 
382
  @staticmethod
383
  def _extract_filename_from_content_disposition(content_disposition: str) -> Optional[str]:
 
 
 
 
384
  if "filename=" not in content_disposition:
385
  return None
386
 
@@ -396,14 +161,11 @@ class TaskFileTool:
396
  "text/plain": ".txt",
397
  "text/csv": ".csv",
398
  "application/csv": ".csv",
399
- "text/tab-separated-values": ".tsv",
400
  "application/json": ".json",
401
  "text/markdown": ".md",
402
  "text/html": ".html",
403
  "application/xml": ".xml",
404
  "text/xml": ".xml",
405
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
406
- "application/vnd.ms-excel": ".xls",
407
  }
408
 
409
  for key, ext in mapping.items():
@@ -414,4 +176,7 @@ class TaskFileTool:
414
 
415
  @staticmethod
416
  def _safe_filename(filename: str) -> str:
 
 
 
417
  return os.path.basename(filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
+ import io
3
  import json
4
  import os
5
  from pathlib import Path
6
  from typing import Optional
 
7
  import pandas as pd
8
  import requests
9
 
 
10
  class TaskFileTool:
11
  """
12
+ Downloads and reads task-linked files from the Hugging Face
13
+ Unit 4 scoring API.
14
 
15
+ Supported text extraction:
16
  - txt
17
+ - csv
18
+ - json
19
  - md
20
  - html
21
  - xml
 
 
 
 
 
22
 
23
+ For unsupported or binary files, it safely returns an empty string for now.
24
+ We can extend this later for PDF/images if needed.
25
  """
26
 
27
+ def __init__(self, api_base_url: str, cache_dir:str = "task_files", timeout: int =30):
28
  self.api_base_url = api_base_url.rstrip("/")
29
  self.cache_dir = Path(cache_dir)
30
  self.cache_dir.mkdir(parents=True, exist_ok=True)
31
  self.timeout = timeout
32
+
33
  def get_task_context(self, task_id: str) -> str:
34
  """
35
+ Main entry point used by the agent:
36
  1. download the task file if present
37
  2. read it into text context if supported
38
  """
 
45
  """
46
  Downloads the file linked to a task_id using:
47
  GET /files/{task_id}
48
+
49
+ Returns:
50
+ Path to saved file if successful, else None
51
  """
52
+ url = f"{self.api_base_url}/file/{task_id}"
53
 
54
  try:
55
  response = requests.get(url, timeout=self.timeout)
56
+ except requests.RequestException:
 
57
  return None
58
+
59
+ if response.status_code !=200:
 
60
  return None
61
+
62
  filename = self._infer_filename(response=response, task_id=task_id)
63
  file_path = self.cache_dir / filename
64
 
 
66
  with open(file_path, "wb") as f:
67
  f.write(response.content)
68
  return file_path
69
+ except OSError:
 
70
  return None
71
+ return file_path
72
+
73
  def read_file_as_text(self, file_path: Path) -> str:
74
  """
75
  Reads supported file types into plain text.
 
77
  suffix = file_path.suffix.lower()
78
 
79
  try:
80
+ if suffix in {".txt", ".md", ".html", ".xml", ".csv", ".json"}:
81
  return self._read_supported_text_file(file_path, suffix)
82
 
83
+ # common fallback for files saved without extension but actually text
 
 
84
  if suffix == "":
85
  return self._read_extensionless_file(file_path)
86
 
 
87
  return ""
88
+ except Exception:
 
89
  return ""
90
 
91
  def _read_supported_text_file(self, file_path: Path, suffix: str) -> str:
 
103
  if suffix == ".csv":
104
  try:
105
  df = pd.read_csv(file_path)
106
+ return df.to_csv(index=False)
 
 
 
 
 
 
 
107
  except Exception:
108
  return file_path.read_text(encoding="utf-8", errors="ignore")
109
 
110
  return ""
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  def _read_extensionless_file(self, file_path: Path) -> str:
113
  """
114
  Try to interpret extensionless files as utf-8 text first.
 
142
 
143
  @staticmethod
144
  def _extract_filename_from_content_disposition(content_disposition: str) -> Optional[str]:
145
+ """
146
+ Example header:
147
+ content-disposition: attachment; filename="example.csv"
148
+ """
149
  if "filename=" not in content_disposition:
150
  return None
151
 
 
161
  "text/plain": ".txt",
162
  "text/csv": ".csv",
163
  "application/csv": ".csv",
 
164
  "application/json": ".json",
165
  "text/markdown": ".md",
166
  "text/html": ".html",
167
  "application/xml": ".xml",
168
  "text/xml": ".xml",
 
 
169
  }
170
 
171
  for key, ext in mapping.items():
 
176
 
177
  @staticmethod
178
  def _safe_filename(filename: str) -> str:
179
+ """
180
+ Prevent path traversal and weird path issues.
181
+ """
182
  return os.path.basename(filename)