Cmuroc27 committed on
Commit
602af35
·
1 Parent(s): ce06eb3

para leer bien documentos

Browse files
Files changed (2) hide show
  1. app.py +10 -2
  2. tools.py +67 -67
app.py CHANGED
@@ -56,8 +56,16 @@ class BasicAgent:
56
 
57
  answer = asyncio.run(
58
  (asyncio.wait_for(self.agent.run(full_question), timeout=60.0)))
59
-
60
- print(f" Final answer is : {answer[:60]}...")
 
 
 
 
 
 
 
 
61
  return answer
62
 
63
  except asyncio.TimeoutError:
 
56
 
57
  answer = asyncio.run(
58
  (asyncio.wait_for(self.agent.run(full_question), timeout=60.0)))
59
+
60
+ if answer.endswith('.') and len(answer) > 2:
61
+ # Conservar si es "U.S.A." o similar (más de 1 punto)
62
+ if answer.count('.') == 1:
63
+ answer = answer.rstrip('.')
64
+
65
+ # Normalizar comas
66
+ if "," in answer and ", " not in answer:
67
+ answer = answer.replace(", ", ",")
68
+
69
  return answer
70
 
71
  except asyncio.TimeoutError:
tools.py CHANGED
@@ -70,86 +70,86 @@ def get_youtube_transcript(video_url: str) -> str:
70
 
71
  except Exception as e:
72
  return f"Unavailable. Error: {str(e)}"
 
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
 
 
75
 
76
-
77
-
78
- def read_document(file_path: str) -> str:
79
  try:
80
- # 1. Buscar archivo localmente
81
- possible_paths = [
82
- file_path,
83
- os.path.join(".", file_path),
84
- os.path.join("/tmp", file_path),
85
- os.path.join(os.getcwd(), file_path),
86
- ]
87
-
88
- actual_path = None
89
- for path in possible_paths:
90
- if os.path.exists(path):
91
- actual_path = path
92
- break
93
-
94
- # 2. Si no se encuentra, intentar descargar desde GAIA endpoint
95
- if not actual_path:
96
- download_url = f"https://agents-course-unit4-scoring.hf.space/files/{file_path}"
97
- print(f"📥 Attempting to download: {download_url}")
98
- try:
99
- response = requests.get(download_url, timeout=15)
100
- response.raise_for_status()
101
-
102
- # Guardar en archivo temporal
103
- suffix = os.path.splitext(file_path)[1] or ""
104
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
105
- tmp.write(response.content)
106
- actual_path = tmp.name
107
- print(f"✅ Downloaded and saved to: {actual_path}")
108
-
109
- except Exception as dl_err:
110
- # Listar archivos disponibles localmente como fallback
111
- try:
112
- available = os.listdir(".")
113
- except:
114
- available = []
115
- return (f"Error: File '{file_path}' not found locally or via download. "
116
- f"Download failed: {str(dl_err)}. "
117
- f"Local files: {', '.join(available[:10])}")
118
-
119
- # 3. Leer el archivo (ya sea local o descargado)
120
- file_ext = os.path.splitext(actual_path)[1].lower()
121
-
122
- # Audio
123
- if file_ext in ['.mp3', '.wav', '.m4a', '.flac', '.ogg']:
124
  transcription = transcribe_audio_openai(actual_path)
125
  result = f"Audio transcription: {transcription}"
126
-
127
- # Documentos soportados
128
- elif file_ext in ['.txt', '.pdf', '.docx', '.csv', '.json', '.md', '.xlsx']:
129
  reader = SimpleDirectoryReader(input_files=[actual_path])
130
- documents = reader.load_data()
131
-
132
- if not documents:
133
- result = "Error: No content found in file"
134
- else:
135
- full_text = "\n\n".join(doc.text for doc in documents)
136
- result = f"File: {os.path.basename(actual_path)}\n\n{full_text}"
137
-
138
  else:
139
- result = f"Error: Unsupported file type: {file_ext} (path: {actual_path})"
140
-
141
- # 4. Limpiar: borrar archivo temporal si se descargó
142
- if actual_path and not any(os.path.samefile(actual_path, p) for p in possible_paths if os.path.exists(p)):
 
 
143
  try:
144
  os.unlink(actual_path)
145
- print(f"🧹 Cleaned up temporary file: {actual_path}")
146
  except:
147
- pass # no hacer nada si falla la limpieza
148
 
149
- return result
150
 
151
- except Exception as e:
152
- return f"Error reading file '{file_path}': {str(e)}"
153
 
154
 
155
 
 
70
 
71
  except Exception as e:
72
  return f"Unavailable. Error: {str(e)}"
73
+
74
+
75
 
76
def _suffix_from_content_type(content_type: str) -> str:
    """Map an HTTP Content-Type header value to a file suffix ('.xlsx' if unknown)."""
    content_type = content_type.lower()
    if 'pdf' in content_type:
        return '.pdf'
    if 'excel' in content_type or 'sheet' in content_type:
        return '.xlsx'
    if 'json' in content_type:
        return '.json'
    if 'text' in content_type or 'csv' in content_type:
        return '.csv'
    return '.xlsx'  # fallback when the server gives no usable hint


def read_document(file_spec: str) -> str:
    """Read a document (or transcribe an audio file) and return its text.

    Args:
        file_spec: Either a local file path (e.g. "data.xlsx"), looked up
            as given, under "./" and under "/tmp/"; or a task id (e.g.
            "abc123"), in which case the file is downloaded from the
            scoring endpoint into a temporary file.

    Returns:
        The extracted text prefixed with the file name, an
        "Audio transcription: ..." string for audio files, or a
        human-readable error message. Errors are reported through the
        return value rather than raised.
    """
    import os
    import tempfile
    from urllib.parse import quote

    actual_path = None
    # Track explicitly whether we created the file, so cleanup never depends
    # on variables that only exist in the local-lookup branch.
    downloaded = False

    # 1. Heuristic: a short, purely alphanumeric spec without a dot is
    #    treated as a task id rather than a file name.
    # NOTE(review): ids containing hyphens (e.g. UUIDs) would not match this
    # heuristic and would fall through to the local lookup — confirm the id
    # format against the scoring API.
    looks_like_task_id = (
        os.path.basename(file_spec).isalnum()
        and len(file_spec) in range(5, 20)
        and '.' not in file_spec
    )

    if looks_like_task_id:
        download_url = f"https://agents-course-unit4-scoring.hf.space/files/{quote(file_spec)}"
        print(f"📥 Downloading from: {download_url}")
        try:
            # Imported lazily so purely local reads do not require requests.
            import requests

            response = requests.get(download_url, timeout=15)
            response.raise_for_status()

            # The download carries no file name, so derive the extension
            # from the Content-Type header.
            suffix = _suffix_from_content_type(
                response.headers.get('content-type', '')
            )
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                actual_path = tmp.name
                downloaded = True
                tmp.write(response.content)
            print(f"✅ Saved as: {actual_path}")

        except Exception as e:
            # Do not leave a partially written temp file behind.
            if downloaded and actual_path:
                try:
                    os.unlink(actual_path)
                except OSError:
                    pass
            return f"Download failed for task_id '{file_spec}': {str(e)}"

    else:
        # 2. Look for the file locally. isfile() (not exists()) so that
        #    directories such as "./" for an empty spec are not picked up.
        for candidate in (file_spec, f"./{file_spec}", f"/tmp/{file_spec}"):
            if os.path.isfile(candidate):
                actual_path = candidate
                break

    if not actual_path:
        return f"File not found locally or downloadable: {file_spec}"

    # 3. Read the file according to its extension.
    try:
        ext = os.path.splitext(actual_path)[1].lower()
        if ext in ['.mp3', '.wav', '.m4a', '.flac', '.ogg']:
            transcription = transcribe_audio_openai(actual_path)
            result = f"Audio transcription: {transcription}"
        elif ext in ['.txt', '.pdf', '.docx', '.csv', '.json', '.md', '.xlsx']:
            reader = SimpleDirectoryReader(input_files=[actual_path])
            docs = reader.load_data()
            full_text = "\n\n".join(doc.text for doc in docs) if docs else ""
            result = f"File: {os.path.basename(actual_path)}\n\n{full_text}"
        else:
            result = f"Unsupported file type: {ext}"
    except Exception as e:
        result = f"Error reading {actual_path}: {str(e)}"
    finally:
        # Remove only files this call downloaded; never touch the caller's
        # local files. Cleanup is best-effort.
        if downloaded:
            try:
                os.unlink(actual_path)
            except OSError:
                pass

    return result
152
 
 
 
153
 
154
 
155