Toulik committed on
Commit
95e19a1
·
verified ·
1 Parent(s): 3c29cfd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -15
app.py CHANGED
@@ -126,23 +126,98 @@ def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str]) -
126
  data = {"_parsing_error": True, "raw_output": text}
127
  return data
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
  # ----------------------
131
  # Main processing function
132
  # ----------------------
 
133
  def process_file(file_obj) -> Dict[str, Any]:
134
  """
135
- file_obj: the uploaded file object provided by Gradio; has .name and a .file-like interface
136
  Returns metadata dict ready to display.
137
  """
138
- # Save uploaded file to temporary path
139
- with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file_obj.name)[1]) as tmp:
140
- tmp.write(file_obj.read())
141
- tmp_path = tmp.name
142
 
143
- # Extract text
144
  try:
145
- if file_obj.name.lower().endswith(".pdf"):
146
  extracted_text = extract_text_from_pdf(tmp_path)
147
  else:
148
  extracted_text = extract_text_from_image(tmp_path)
@@ -154,27 +229,22 @@ def process_file(file_obj) -> Dict[str, Any]:
154
 
155
  # Chunk and pick top chunks
156
  chunks = chunk_text(extracted_text)
157
- # Heuristic: pick longest chunks as representative
158
  sorted_chunks = sorted(chunks, key=lambda x: len(x), reverse=True)
159
  top_chunks = sorted_chunks[:6] if sorted_chunks else [extracted_text[:2000]]
160
 
161
- # Prepare a "short_text" to feed to the LLM
162
  short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
163
 
164
- # Call LLM
165
- metadata = call_gpt5_for_metadata(file_obj.name, short_text, top_chunks)
166
 
167
- # If LLM returned a parsing error, include it
168
  if metadata.get("_parsing_error"):
169
  return {
170
  "error": "LLM output parsing failed. See raw_output.",
171
  "raw_output": metadata.get("raw_output")
172
  }
173
 
174
- # Ensure required keys exist and post-process small things
175
  now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
176
- metadata.setdefault("doc_id", os.path.splitext(file_obj.name)[0])
177
- metadata.setdefault("title", file_obj.name)
178
  metadata.setdefault("source", "user_upload")
179
  metadata.setdefault("raw_url", "")
180
  metadata.setdefault("ingest_timestamp", now)
 
126
  data = {"_parsing_error": True, "raw_output": text}
127
  return data
128
 
129
def _write_payload_to_tmp(payload, name):
    """Write *payload* to a new persistent temp file that keeps *name*'s extension.

    Args:
        payload: bytes-like content, or a ``str`` (encoded as UTF-8 before writing).
        name: original file name or path; only its extension and basename are used.

    Returns:
        ``(tmp_path, original_basename)``.

    Raises:
        Whatever the write raises (e.g. ``TypeError`` for a non-bytes payload).
        The partially created temp file is removed before the error propagates.
    """
    if isinstance(payload, str):
        payload = payload.encode("utf-8")
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(name)[1])
    try:
        with tmp:
            tmp.write(payload)
    except Exception:
        # delete=False means nobody else will clean this up; do not leak it.
        try:
            os.unlink(tmp.name)
        except OSError:
            pass  # best effort: the original error is the one worth reporting
        raise
    return tmp.name, os.path.basename(name)


# helper: accept multiple upload types and return saved temp path and original name
def save_uploaded_to_tmp(file_obj):
    """
    Normalize whatever the upload widget handed over into a file on disk.

    Accepts:
    - a file-like object with .read()
    - a path string (existing file path); returned as-is, NOT copied
    - a dict-like object returned by some gradio versions: {"name": "...", "data": b'...'}
    - a wrapper that exposes a .name path but no usable .read()

    Returns (tmp_path, original_name)

    Raises ValueError when none of the shapes above apply; the last underlying
    error (if any) is chained as the cause so the failure can be diagnosed.
    A malformed dict payload propagates the write error directly.
    """
    last_error = None

    # Case 1: file-like object with .read()
    if callable(getattr(file_obj, "read", None)):
        try:
            content = file_obj.read()
            name = getattr(file_obj, "name", "uploaded_file")
            if not isinstance(name, str):
                # e.g. files opened from a descriptor report an int name
                name = "uploaded_file"
            return _write_payload_to_tmp(content, name)
        except Exception as exc:
            # Deliberate best effort: remember why and try the other handlers.
            last_error = exc

    # Case 2: Gradio sometimes returns a dict-like object with 'name' and 'data'
    if isinstance(file_obj, dict) and "data" in file_obj and "name" in file_obj:
        return _write_payload_to_tmp(file_obj["data"], file_obj["name"])

    # Case 3: file_obj is a path string that already exists on disk.
    # (A non-existent path cannot be opened either, so there is nothing more
    # to try for plain strings; str subclasses may still match Case 4.)
    if isinstance(file_obj, str) and os.path.exists(file_obj):
        return file_obj, os.path.basename(file_obj)

    # Case 4: some wrappers expose .name but not .read (e.g., NamedString)
    name = getattr(file_obj, "name", None)
    if name and isinstance(name, str):
        try:
            with open(name, "rb") as src:
                content = src.read()
            return _write_payload_to_tmp(content, name)
        except Exception as exc:
            last_error = exc

    # If we reach here, we can't handle the object
    raise ValueError(
        f"Unsupported uploaded file object type: {type(file_obj)}. Value: {str(file_obj)[:200]}"
    ) from last_error
202
+
203
 
204
  # ----------------------
205
  # Main processing function
206
  # ----------------------
207
+ # Updated process_file using the helper above
208
  def process_file(file_obj) -> Dict[str, Any]:
209
  """
210
+ file_obj: whatever gradio handed to us (file-like, dict, path string, etc.)
211
  Returns metadata dict ready to display.
212
  """
213
+ try:
214
+ tmp_path, orig_name = save_uploaded_to_tmp(file_obj)
215
+ except Exception as e:
216
+ return {"error": f"Failed to save uploaded file: {e}"}
217
 
218
+ # Now use tmp_path and orig_name for the rest of the pipeline
219
  try:
220
+ if orig_name.lower().endswith(".pdf"):
221
  extracted_text = extract_text_from_pdf(tmp_path)
222
  else:
223
  extracted_text = extract_text_from_image(tmp_path)
 
229
 
230
  # Chunk and pick top chunks
231
  chunks = chunk_text(extracted_text)
 
232
  sorted_chunks = sorted(chunks, key=lambda x: len(x), reverse=True)
233
  top_chunks = sorted_chunks[:6] if sorted_chunks else [extracted_text[:2000]]
234
 
 
235
  short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
236
 
237
+ metadata = call_gpt5_for_metadata(orig_name, short_text, top_chunks)
 
238
 
 
239
  if metadata.get("_parsing_error"):
240
  return {
241
  "error": "LLM output parsing failed. See raw_output.",
242
  "raw_output": metadata.get("raw_output")
243
  }
244
 
 
245
  now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
246
+ metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
247
+ metadata.setdefault("title", orig_name)
248
  metadata.setdefault("source", "user_upload")
249
  metadata.setdefault("raw_url", "")
250
  metadata.setdefault("ingest_timestamp", now)