Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -69,39 +69,49 @@ class SerpClient:
|
|
| 69 |
# ---------------------------
|
| 70 |
# Safe file text extraction (Gradio returns FileData dict: {"name", "size", "path"})
|
| 71 |
# ---------------------------
|
| 72 |
-
def extract_text_from_gradio_file(filedata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
if not filedata:
|
| 74 |
return ""
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
if
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
try:
|
| 81 |
-
lower = file_path.lower()
|
| 82 |
if lower.endswith(".txt"):
|
| 83 |
-
|
| 84 |
-
|
| 85 |
if lower.endswith(".pdf"):
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
except Exception:
|
| 91 |
-
# fallback: try binary read and decode
|
| 92 |
-
with open(file_path, "rb") as f:
|
| 93 |
-
return f.read().decode("utf-8", errors="ignore")
|
| 94 |
if lower.endswith(".docx"):
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
with open(file_path, "rb") as f:
|
| 101 |
-
return f.read().decode("utf-8", errors="ignore")
|
| 102 |
-
# Fallback: read bytes
|
| 103 |
with open(file_path, "rb") as f:
|
| 104 |
return f.read().decode("utf-8", errors="ignore")
|
|
|
|
| 105 |
except Exception:
|
| 106 |
return ""
|
| 107 |
|
|
|
|
| 69 |
# ---------------------------
|
| 70 |
# Safe file text extraction (Gradio passes either a FileData dict {"name", "size", "path"} or a path-like NamedString)
|
| 71 |
# ---------------------------
|
| 72 |
+
def extract_text_from_gradio_file(filedata):
    """Return the text content of a file uploaded through Gradio.

    Supports both upload value shapes:

    1. HF Spaces dict, e.g. ``{"name": "..", "path": "...", "size": ...}``.
       The ``"path"`` entry is preferred; ``"name"`` is used as a fallback.
    2. A path-like value such as a NamedString (or plain ``str``), which is
       converted to the file path with ``str()``.

    The file type is chosen from the (case-insensitive) file extension:
    ``.txt`` is read as UTF-8 text, ``.pdf`` is parsed with ``pypdf`` and
    ``.docx`` with the ``docx`` module. Any other extension is read as raw
    bytes and decoded as UTF-8. Undecodable bytes are ignored.

    Returns:
        The extracted text, or ``""`` when ``filedata`` is falsy, when no
        path can be determined from a dict, or when reading/parsing fails
        for any reason (including a missing optional parser library).
    """
    if not filedata:
        return ""

    # Case A: filedata is a dict
    if isinstance(filedata, dict):
        file_path = filedata.get("path") or filedata.get("name")
        if not file_path:
            return ""
        file_path = str(file_path)
    # Case B: filedata is a NamedString (just a string holding the path)
    else:
        file_path = str(filedata)

    lower = file_path.lower()

    # Extraction is deliberately best-effort: any failure yields "" so a bad
    # upload never crashes the caller.
    try:
        if lower.endswith(".txt"):
            # Context manager guarantees the handle is closed after reading.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()

        if lower.endswith(".pdf"):
            # Imported lazily so a missing pypdf only disables PDF support.
            from pypdf import PdfReader

            reader = PdfReader(file_path)
            # extract_text() may return None for a page; treat that as empty.
            return "\n".join(page.extract_text() or "" for page in reader.pages)

        if lower.endswith(".docx"):
            # Imported lazily so a missing docx only disables DOCX support.
            import docx

            doc = docx.Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs)

        # Fallback: read raw bytes and decode leniently.
        with open(file_path, "rb") as f:
            return f.read().decode("utf-8", errors="ignore")

    except Exception:
        return ""
|
| 117 |
|