Spaces:

tejovanth
/

examplethree

Sleeping

App Files Files Community

tejovanth committed on Apr 18, 2025

Commit

53883a6

verified ·

1 Parent(s): 8549f68

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -94

app.py DELETED Viewed

@@ -1,94 +0,0 @@
-import gradio as gr
-import fitz
-import torch
-from transformers import pipeline
-import time, logging, re, pandas as pd, docx, pytesseract, openpyxl, textract, mimetypes
-from PIL import Image
-from io import BytesIO
-from striprtf.striprtf import rtf_to_text
-logging.basicConfig(level=logging.ERROR)
-device = -1  # CPU-only
-print("⚠️ CPU-only. Expect ~10–15s for 300,000 chars.")
-try:
-    summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
-except Exception as e:
-    print(f"❌ Model loading failed: {str(e)}")
-    exit(1)
-def summarize_file(file):
-    start = time.time()
-    if not hasattr(file, 'read') or not hasattr(file, 'name'):
-        return "❌ Invalid file: Missing read() or name attribute"
-    print(f"File: {file.name}")
-    try:
-        file_bytes = file.read()
-        if not isinstance(file_bytes, bytes) or len(file_bytes) == 0:
-            return "❌ Invalid file: Empty or non-binary content"
-        mime, _ = mimetypes.guess_type(file.name) or ('text/plain', None)
-        text = ""
-        if mime == 'application/pdf':
-            try:
-                doc = fitz.open(stream=file_bytes, filetype="pdf")
-                text = "".join(page.get_text("text") for page in doc)
-            except:
-                return "❌ PDF parsing failed"
-        elif mime in ['text/plain', 'text/rtf']:
-            text = rtf_to_text(file_bytes.decode("utf-8", errors="ignore")) if mime == 'text/rtf' else file_bytes.decode("utf-8", errors="ignore")
-        elif mime in ['text/csv', 'application/vnd.ms-excel']:
-            text = " ".join(pd.read_csv(BytesIO(file_bytes)).astype(str).values.flatten())
-        elif mime == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
-            doc = docx.Document(BytesIO(file_bytes))
-            text = " ".join(p.text for p in doc.paragraphs if p.text)
-        elif mime in ['image/jpeg', 'image/png']:
-            img = Image.open(BytesIO(file_bytes)).convert('L').resize((int(img.width * 300 / img.height), 300))
-            text = pytesseract.image_to_string(img)
-        elif mime == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
-            df = pd.read_excel(BytesIO(file_bytes), engine='openpyxl')
-            text = " ".join(df.astype(str).values.flatten())
-        else:
-            text = textract.process(file_bytes).decode("utf-8", errors="ignore")
-        # Strict text cleaning
-        text = re.sub(r"[^\x20-\x7E]", "", text)  # Keep printable ASCII only
-        text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
-        text = re.sub(r"\\cap", "intersection", text)
-        text = re.sub(r"\s+", " ", text).strip()
-        if not text or len(text) < 100 or sum(1 for c in text if c.isalnum()) < 50:
-            return "❌ Extracted text invalid or too short"
-        print(f"Extracted chars: {len(text)}")
-    except Exception as e:
-        return f"❌ Text extraction failed: {str(e)}"
-    text = text[:300000]
-    chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
-    print(f"Chunks created: {len(chunks)}")
-    if not chunks: return "❌ No chunks to summarize"
-    # Select 12 chunks evenly spaced
-    selected_indices = [int(i * len(chunks) / 12) for i in range(12)] if len(chunks) >= 12 else list(range(len(chunks)))
-    summaries = []
-    for i in selected_indices:
-        chunk = chunks[i]
-        if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.7:
-            summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
-            continue
-        try:
-            summary = summarizer(chunk, max_length=40, min_length=10, do_sample=False)[0]['summary_text']
-            summaries.append(f"**Chunk {i+1}**:\n{summary}")
-        except Exception as e:
-            summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")
-    # Pad to 12 summaries
-    while len(summaries) < 12:
-        summaries.append(f"**Chunk {len(summaries)+1}**: Insufficient content for full summary")
-    return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries[:12])
-demo = gr.Interface(
-    fn=summarize_file, inputs=gr.File(label="📄 Any File", type="binary"),
-    outputs=gr.Textbox(label="📝 Summary"),
-    title="Fast Summarizer", description="300,000+ chars in ~10–15s (CPU)"
-)
-if __name__ == "__main__":
-    try:
-        demo.launch(share=False, server_port=7860)
-    except Exception as e:
-        print(f"❌ Gradio launch failed: {str(e)}")