Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -19,14 +19,21 @@ except Exception as e:
|
|
| 19 |
|
| 20 |
def summarize_file(file):
|
| 21 |
start = time.time()
|
| 22 |
-
|
|
|
|
|
|
|
| 23 |
try:
|
| 24 |
-
file_bytes = file.read()
|
| 25 |
-
|
|
|
|
|
|
|
| 26 |
text = ""
|
| 27 |
if mime == 'application/pdf':
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
| 30 |
elif mime in ['text/plain', 'text/rtf']:
|
| 31 |
text = rtf_to_text(file_bytes.decode("utf-8", errors="ignore")) if mime == 'text/rtf' else file_bytes.decode("utf-8", errors="ignore")
|
| 32 |
elif mime in ['text/csv', 'application/vnd.ms-excel']:
|
|
@@ -42,14 +49,16 @@ def summarize_file(file):
|
|
| 42 |
text = " ".join(df.astype(str).values.flatten())
|
| 43 |
else:
|
| 44 |
text = textract.process(file_bytes).decode("utf-8", errors="ignore")
|
|
|
|
|
|
|
| 45 |
text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
|
| 46 |
text = re.sub(r"\\cap", "intersection", text)
|
| 47 |
text = re.sub(r"\s+", " ", text).strip()
|
| 48 |
-
text
|
|
|
|
| 49 |
print(f"Extracted chars: {len(text)}")
|
| 50 |
except Exception as e:
|
| 51 |
return f"β Text extraction failed: {str(e)}"
|
| 52 |
-
if not text.strip(): return "β No text found"
|
| 53 |
text = text[:300000]
|
| 54 |
chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
|
| 55 |
print(f"Chunks created: {len(chunks)}")
|
|
@@ -67,7 +76,7 @@ def summarize_file(file):
|
|
| 67 |
summaries.append(f"**Chunk {i+1}**:\n{summary}")
|
| 68 |
except Exception as e:
|
| 69 |
summaries.append(f"**Chunk {i+1}**: β Error: {str(e)}")
|
| 70 |
-
# Pad
|
| 71 |
while len(summaries) < 12:
|
| 72 |
summaries.append(f"**Chunk {len(summaries)+1}**: Insufficient content for full summary")
|
| 73 |
return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries[:12])
|
|
|
|
| 19 |
|
| 20 |
def summarize_file(file):
|
| 21 |
start = time.time()
|
| 22 |
+
if not hasattr(file, 'read') or not hasattr(file, 'name'):
|
| 23 |
+
return "β Invalid file: Missing read() or name attribute"
|
| 24 |
+
print(f"File: {file.name}")
|
| 25 |
try:
|
| 26 |
+
file_bytes = file.read()
|
| 27 |
+
if not isinstance(file_bytes, bytes) or len(file_bytes) == 0:
|
| 28 |
+
return "β Invalid file: Empty or non-binary content"
|
| 29 |
+
mime, _ = mimetypes.guess_type(file.name) or ('text/plain', None)
|
| 30 |
text = ""
|
| 31 |
if mime == 'application/pdf':
|
| 32 |
+
try:
|
| 33 |
+
doc = fitz.open(stream=file_bytes, filetype="pdf")
|
| 34 |
+
text = "".join(page.get_text("text") for page in doc)
|
| 35 |
+
except:
|
| 36 |
+
return "β PDF parsing failed"
|
| 37 |
elif mime in ['text/plain', 'text/rtf']:
|
| 38 |
text = rtf_to_text(file_bytes.decode("utf-8", errors="ignore")) if mime == 'text/rtf' else file_bytes.decode("utf-8", errors="ignore")
|
| 39 |
elif mime in ['text/csv', 'application/vnd.ms-excel']:
|
|
|
|
| 49 |
text = " ".join(df.astype(str).values.flatten())
|
| 50 |
else:
|
| 51 |
text = textract.process(file_bytes).decode("utf-8", errors="ignore")
|
| 52 |
+
# Strict text cleaning
|
| 53 |
+
text = re.sub(r"[^\x20-\x7E]", "", text) # Keep printable ASCII only
|
| 54 |
text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
|
| 55 |
text = re.sub(r"\\cap", "intersection", text)
|
| 56 |
text = re.sub(r"\s+", " ", text).strip()
|
| 57 |
+
if not text or len(text) < 100 or sum(1 for c in text if c.isalnum()) < 50:
|
| 58 |
+
return "β Extracted text invalid or too short"
|
| 59 |
print(f"Extracted chars: {len(text)}")
|
| 60 |
except Exception as e:
|
| 61 |
return f"β Text extraction failed: {str(e)}"
|
|
|
|
| 62 |
text = text[:300000]
|
| 63 |
chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
|
| 64 |
print(f"Chunks created: {len(chunks)}")
|
|
|
|
| 76 |
summaries.append(f"**Chunk {i+1}**:\n{summary}")
|
| 77 |
except Exception as e:
|
| 78 |
summaries.append(f"**Chunk {i+1}**: β Error: {str(e)}")
|
| 79 |
+
# Pad to 12 summaries
|
| 80 |
while len(summaries) < 12:
|
| 81 |
summaries.append(f"**Chunk {len(summaries)+1}**: Insufficient content for full summary")
|
| 82 |
return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries[:12])
|