Spaces:

tejovanth
/

examplethree

Sleeping

App Files Community

tejovanth committed on Apr 18, 2025

Commit

8549f68

verified ·

1 Parent(s): 24d9906

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -8

app.py CHANGED Viewed

@@ -19,14 +19,21 @@ except Exception as e:
 def summarize_file(file):
     start = time.time()
-    print(f"File: {file.name if hasattr(file, 'name') else 'unknown'}")
     try:
-        file_bytes = file.read() if hasattr(file, 'read') else file
-        mime, _ = mimetypes.guess_type(file.name) if hasattr(file, 'name') else (None, None)
         text = ""
         if mime == 'application/pdf':
-            doc = fitz.open(stream=file_bytes, filetype="pdf")
-            text = "".join(page.get_text("text") for page in doc)
         elif mime in ['text/plain', 'text/rtf']:
             text = rtf_to_text(file_bytes.decode("utf-8", errors="ignore")) if mime == 'text/rtf' else file_bytes.decode("utf-8", errors="ignore")
         elif mime in ['text/csv', 'application/vnd.ms-excel']:
@@ -42,14 +49,16 @@ def summarize_file(file):
             text = " ".join(df.astype(str).values.flatten())
         else:
             text = textract.process(file_bytes).decode("utf-8", errors="ignore")
         text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
         text = re.sub(r"\\cap", "intersection", text)
         text = re.sub(r"\s+", " ", text).strip()
-        text = "".join(c for c in text if ord(c) < 128)
         print(f"Extracted chars: {len(text)}")
     except Exception as e:
         return f"❌ Text extraction failed: {str(e)}"
-    if not text.strip(): return "❌ No text found"
     text = text[:300000]
     chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
     print(f"Chunks created: {len(chunks)}")
@@ -67,7 +76,7 @@ def summarize_file(file):
             summaries.append(f"**Chunk {i+1}**:\n{summary}")
         except Exception as e:
             summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")
-    # Pad if <12 summaries
     while len(summaries) < 12:
         summaries.append(f"**Chunk {len(summaries)+1}**: Insufficient content for full summary")
     return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries[:12])

 def summarize_file(file):
     start = time.time()
+    if not hasattr(file, 'read') or not hasattr(file, 'name'):
+        return "❌ Invalid file: Missing read() or name attribute"
+    print(f"File: {file.name}")
     try:
+        file_bytes = file.read()
+        if not isinstance(file_bytes, bytes) or len(file_bytes) == 0:
+            return "❌ Invalid file: Empty or non-binary content"
+        mime, _ = mimetypes.guess_type(file.name) or ('text/plain', None)
         text = ""
         if mime == 'application/pdf':
+            try:
+                doc = fitz.open(stream=file_bytes, filetype="pdf")
+                text = "".join(page.get_text("text") for page in doc)
+            except:
+                return "❌ PDF parsing failed"
         elif mime in ['text/plain', 'text/rtf']:
             text = rtf_to_text(file_bytes.decode("utf-8", errors="ignore")) if mime == 'text/rtf' else file_bytes.decode("utf-8", errors="ignore")
         elif mime in ['text/csv', 'application/vnd.ms-excel']:
             text = " ".join(df.astype(str).values.flatten())
         else:
             text = textract.process(file_bytes).decode("utf-8", errors="ignore")
+        # Strict text cleaning
+        text = re.sub(r"[^\x20-\x7E]", "", text)  # Keep printable ASCII only
         text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
         text = re.sub(r"\\cap", "intersection", text)
         text = re.sub(r"\s+", " ", text).strip()
+        if not text or len(text) < 100 or sum(1 for c in text if c.isalnum()) < 50:
+            return "❌ Extracted text invalid or too short"
         print(f"Extracted chars: {len(text)}")
     except Exception as e:
         return f"❌ Text extraction failed: {str(e)}"
     text = text[:300000]
     chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
     print(f"Chunks created: {len(chunks)}")
             summaries.append(f"**Chunk {i+1}**:\n{summary}")
         except Exception as e:
             summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")
+    # Pad to 12 summaries
     while len(summaries) < 12:
         summaries.append(f"**Chunk {len(summaries)+1}**: Insufficient content for full summary")
     return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries[:12])