Spaces:

tejovanth
/

examplethree

Sleeping

App Files Files Community

tejovanth committed on Apr 18, 2025

Commit

47a8c64

verified ·

1 Parent(s): 1ae4c5e

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -64

app.py CHANGED Viewed

@@ -1,110 +1,114 @@
 import gradio as gr
-import fitz  # PyMuPDF
 import torch
 from transformers import pipeline
-import time, logging
-import re
 import tempfile
# === Setup Logging and Device ===
logging.basicConfig(level=logging.ERROR)
device = -1  # CPU
print("⚠️ CPU-only mode. Expect ~20–30s for large documents.")

# === Load Summarization Model ===
# The pipeline is built once at import time and reused by every request.
try:
    summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
except Exception as e:
    print(f"❌ Model loading failed: {e}")
    # `exit` is only injected by the `site` module (absent under `python -S`
    # or in frozen builds); raising SystemExit always works.
    raise SystemExit(1) from e
# === Sentence-based Chunking ===
def smart_chunk(text, max_chunk_len=2000):
    """Split *text* into chunks made of whole sentences.

    A sentence boundary is one or more spaces that follow ``.``, ``!`` or
    ``?``. Sentences are packed greedily: a new chunk is started when adding
    the next sentence would reach ``max_chunk_len`` characters. A single
    sentence longer than the limit is kept whole as its own chunk.

    Args:
        text: The text to split.
        max_chunk_len: Soft upper bound on chunk length, in characters.

    Returns:
        A list of stripped, non-empty chunk strings; ``[]`` when *text* holds
        no non-whitespace content.
    """
    chunks = []
    pending = []      # sentences collected for the chunk being built
    pending_len = 0   # their total length, counting one joining space each
    for sentence in re.split(r'(?<=[.!?]) +', text):
        # Flush only when something is pending; otherwise an over-long first
        # sentence would produce a spurious empty chunk.
        if pending and pending_len + len(sentence) >= max_chunk_len:
            finished = " ".join(pending).strip()
            if finished:
                chunks.append(finished)
            pending, pending_len = [], 0
        pending.append(sentence)
        pending_len += len(sentence) + 1
    tail = " ".join(pending).strip()
    if tail:
        chunks.append(tail)
    return chunks
# === Summarization for One File ===
def summarize_file_bytes(file_bytes, filename):
    """Summarize one uploaded PDF or UTF-8 text file.

    Content starting with ``%PDF`` is parsed with PyMuPDF; anything else is
    decoded as UTF-8 with undecodable bytes ignored. At most the first
    300,000 characters are summarized, chunk by chunk, stopping after about
    20 seconds or once 15 summary lines have been produced.

    Args:
        file_bytes: Raw content of the uploaded file.
        filename: Name used to label the output and error messages.

    Returns:
        A ``(display_text, download_text)`` tuple. On success both entries
        are the same formatted summary; on failure the first entry is an
        error message and the second is ``""``.
    """
    start_time = time.time()
    try:
        if file_bytes[:4].startswith(b'%PDF'):
            # The context manager closes the document; the original left it
            # open for every processed PDF.
            with fitz.open(stream=file_bytes, filetype="pdf") as pdf_doc:
                text = "".join(page.get_text("text", flags=16) for page in pdf_doc)
        else:
            text = file_bytes.decode("utf-8", errors="ignore")
    except Exception as e:
        return f"{filename}: ❌ Text extraction failed: {e}", ""

    text = text.strip()
    if not text:
        return f"{filename}: ❌ No text found.", ""

    text = text[:300000]
    chunks = smart_chunk(text)
    summaries, line_count = [], 0
    for i, chunk in enumerate(chunks):
        if time.time() - start_time > 20:
            summaries.append("⚠️ Stopped early due to time limit.")
            break
        if not chunk:
            # Nothing to summarize; avoid feeding an empty string to the model.
            continue
        try:
            summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
            summaries.append(f"**Chunk {i+1}**:\n{summary.strip()}")
            line_count += summary.count('\n') + 1
            if line_count >= 15:
                break
        except Exception as e:
            # One failing chunk must not abort the rest of the document.
            summaries.append(f"**Chunk {i+1}**: ❌ Error summarizing: {e}")

    total_time = time.time() - start_time
    summary_text = f"📄 **{filename}**\n**Characters**: {len(text)} | **Time**: {total_time:.2f}s\n\n" + "\n\n".join(summaries)
    return summary_text, summary_text
# === Multiple Files Handler ===
def summarize_multiple_files(file_objs):
    """Summarize every uploaded file and write the combined result to a .txt file.

    Args:
        file_objs: The uploads. Each item is either raw ``bytes`` (what the
            Gradio docs describe for ``gr.File(type="binary")``) or a
            ``(file_bytes, file_info)`` pair whose ``file_info['name']``
            holds the original path.

    Returns:
        ``(summary_text, summary_file_path)``. When nothing was uploaded the
        result is ``("❌ No files uploaded", None)``.
    """
    if not file_objs:
        # Gradio passes None when the form is submitted without files.
        return "❌ No files uploaded", None

    all_summaries = []
    sections = []
    for position, item in enumerate(file_objs, start=1):
        if isinstance(item, (bytes, bytearray)):
            # Raw bytes carry no filename, so fall back to a positional label.
            file_bytes, filename = bytes(item), f"file_{position}"
        else:
            file_bytes, file_info = item
            filename = file_info['name'].split("/")[-1]
        summary, raw_text = summarize_file_bytes(file_bytes, filename)
        all_summaries.append(summary)
        sections.append(f"\n\n{raw_text}\n" + "="*60 + "\n")

    # Save combined summary to a temp .txt file; delete=False keeps it on
    # disk so the download component can serve it after this function returns.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as f:
        f.write("".join(sections))
        summary_file_path = f.name
    return "\n\n".join(all_summaries), summary_file_path
# === Gradio Interface ===
# One multi-file upload in; the rendered summary and a downloadable .txt out.
demo = gr.Interface(
    fn=summarize_multiple_files,
    inputs=gr.File(
        label="📄 Upload PDF or TXT files",
        file_types=[".pdf", ".txt"],
        type="binary",
        file_count="multiple",
    ),
    outputs=[
        gr.Textbox(label="📝 Summary", lines=30, max_lines=100),
        gr.File(label="📥 Download Summary as .txt"),
    ],
    title="📚 Multi-File Summarizer",
    description="Summarizes multiple PDFs or TXTs into at least 15 lines each. Download final output as .txt. CPU-optimized.",
)

# === Launch App ===
# Only start the server when run as a script, not when imported.
if __name__ == "__main__":
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as launch_error:
        print(f"❌ Gradio launch failed: {launch_error}")

 import gradio as gr
+import fitz
 import torch
 from transformers import pipeline
+import time, logging, re, pandas as pd, docx, pytesseract, openpyxl, textract, mimetypes
+from PIL import Image
+from io import BytesIO
+from striprtf.striprtf import rtf_to_text
 import tempfile
 logging.basicConfig(level=logging.ERROR)
+device = -1  # CPU-only
+print("⚠️ CPU-only. Expect ~5–9s for 300,000 chars.")
 try:
     summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
 except Exception as e:
+    print(f"❌ Model loading failed: {str(e)}")
     exit(1)
 def summarize_file_bytes(file_bytes, filename):
+    start = time.time()
     try:
+        if not isinstance(file_bytes, bytes) or len(file_bytes) == 0:
+            return f"❌ {filename}: Invalid or empty file", ""
+        mime, _ = mimetypes.guess_type(filename) or ('text/plain', None)
+        text = ""
+        if mime == 'application/pdf':
+            doc = fitz.open(stream=file_bytes, filetype="pdf")
+            text = "".join(page.get_text("text") for page in doc)
+        elif mime in ['text/plain', 'text/rtf']:
+            text = rtf_to_text(file_bytes.decode("utf-8", errors="ignore")) if mime == 'text/rtf' else file_bytes.decode("utf-8", errors="ignore")
+        elif mime in ['text/csv', 'application/vnd.ms-excel']:
+            text = " ".join(pd.read_csv(BytesIO(file_bytes)).astype(str).values.flatten())
+        elif mime == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
+            doc = docx.Document(BytesIO(file_bytes))
+            text = " ".join(p.text for p in doc.paragraphs if p.text)
+        elif mime in ['image/jpeg', 'image/png']:
+            img = Image.open(BytesIO(file_bytes)).convert('L').resize((int(img.width * 300 / img.height), 300))
+            text = pytesseract.image_to_string(img)
+        elif mime == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
+            df = pd.read_excel(BytesIO(file_bytes), engine='openpyxl')
+            text = " ".join(df.astype(str).values.flatten())
         else:
+            text = textract.process(file_bytes).decode("utf-8", errors="ignore")
+        text = re.sub(r"[^\x20-\x7E]", "", text)  # Printable ASCII only
+        text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
+        text = re.sub(r"\\cap", "intersection", text)
+        text = re.sub(r"\s+", " ", text).strip()
+        if not text or len(text) < 100 or sum(1 for c in text if c.isalnum()) < 50:
+            return f"❌ {filename}: Invalid or too short text", ""
+        print(f"Extracted chars for {filename}: {len(text)}")
     except Exception as e:
+        return f"❌ {filename}: Text extraction failed: {str(e)}", ""
     text = text[:300000]
+    chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
+    print(f"Chunks for {filename}: {len(chunks)}")
+    if not chunks:
+        return f"❌ {filename}: No chunks to summarize", ""
+    selected_indices = [int(i * len(chunks) / 12) for i in range(12)] if len(chunks) >= 12 else list(range(len(chunks)))
+    summaries = []
+    for i in selected_indices:
+        chunk = chunks[i]
+        if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.7:
+            summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
+            continue
         try:
+            summary = summarizer(chunk, max_length=40, min_length=10, do_sample=False)[0]['summary_text']
+            summaries.append(f"**Chunk {i+1}**:\n{summary}")
         except Exception as e:
+            summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")
+    while len(summaries) < 12:
+        summaries.append(f"**Chunk {len(summaries)+1}**: Insufficient content")
+    summary_text = f"📄 **{filename}**\n**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries[:12])
     return summary_text, summary_text
def _sniff_extension(file_bytes):
    """Guess a file extension from leading magic bytes; default to ``.txt``."""
    signatures = (
        (b"%PDF", ".pdf"),
        (b"\x89PNG", ".png"),
        (b"\xff\xd8\xff", ".jpg"),
        (b"{\\rtf", ".rtf"),
    )
    for magic, extension in signatures:
        if file_bytes.startswith(magic):
            return extension
    # NOTE(review): zip-based formats (.docx/.xlsx) share one signature and
    # cannot be told apart here, so they also fall through to ".txt".
    return ".txt"


def summarize_multiple_files(*file_objs):
    """Summarize every uploaded file and write the combined result to a .txt file.

    Args:
        *file_objs: Either one list of uploads or the uploads themselves.
            Each upload may be raw ``bytes`` (what the Gradio docs describe
            for ``gr.File(type="binary")``) or a file-like object exposing
            ``read()`` and ``name``.

    Returns:
        ``(summary_text, summary_file_path)``, or
        ``("❌ No files uploaded", None)`` when nothing was uploaded.
    """
    if not file_objs or not any(file_objs):
        return "❌ No files uploaded", None

    uploads = file_objs[0] if isinstance(file_objs[0], list) else file_objs
    all_summaries = []
    sections = []
    for position, upload in enumerate(uploads, start=1):
        if isinstance(upload, (bytes, bytearray)):
            # Raw bytes carry no filename; synthesize one whose extension
            # lets the per-file summarizer pick the right parser.
            file_bytes = bytes(upload)
            filename = f"file_{position}{_sniff_extension(file_bytes)}"
        elif hasattr(upload, 'read') and hasattr(upload, 'name'):
            filename = upload.name.split("/")[-1]
            file_bytes = upload.read()
        else:
            all_summaries.append("❌ Invalid file: Missing read() or name")
            continue
        summary, raw_text = summarize_file_bytes(file_bytes, filename)
        all_summaries.append(summary)
        sections.append(f"\n\n{raw_text}\n" + "="*60 + "\n")

    # delete=False keeps the file on disk so the download component can
    # serve it after this function returns.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as f:
        f.write("".join(sections))
        summary_file_path = f.name
    return "\n\n".join(all_summaries), summary_file_path
 demo = gr.Interface(
     fn=summarize_multiple_files,
+    inputs=gr.File(label="📄 Upload Any File", type="binary", file_count="multiple"),
     outputs=[
+        gr.Textbox(label="📝 Summary", lines=15, max_lines=100),
         gr.File(label="📥 Download Summary as .txt")
     ],
     title="📚 Multi-File Summarizer",
+    description="Summarizes any file into exactly 15 lines. Download as .txt. ~5–9s for 300,000 chars (CPU)."
 )
 if __name__ == "__main__":
     try:
         demo.launch(share=False, server_port=7860)
     except Exception as e:
+        print(f"❌ Gradio launch failed: {str(e)}")