tejovanth committed on
Commit
3930fe6
·
verified ·
1 Parent(s): 1ac6518

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -0
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import fitz
3
+ import torch
4
+ from transformers import pipeline
5
+ import time, logging, re, pandas as pd, docx, pytesseract, openpyxl, textract, mimetypes
6
+ from PIL import Image
7
+ from io import BytesIO
8
+ from striprtf.striprtf import rtf_to_text
9
+
10
# Only errors from the root logger reach the console.
logging.basicConfig(level=logging.ERROR)

device = -1  # CPU-only
print("⚠️ CPU-only. Expect ~10–15s for 300,000 chars.")

# Load the summarization model once at import time so every request reuses
# the same pipeline object instead of paying the load cost per upload.
try:
    summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
except Exception as e:
    print(f"❌ Model loading failed: {str(e)}")
    # `exit()` is a convenience injected by the `site` module and is absent
    # under `python -S` or in frozen builds; raising SystemExit always works.
    raise SystemExit(1)
19
+
20
def _read_upload(file):
    """Return ``(file_bytes, path)`` for the object handed over by the upload widget.

    ``path`` is the on-disk location when one is known (needed for MIME
    guessing and for the textract fallback) and ``None`` otherwise.

    Raises:
        ValueError: if ``file`` is ``None`` (nothing was uploaded).
    """
    if file is None:
        raise ValueError("no file provided")
    if isinstance(file, (bytes, bytearray)):
        return bytes(file), None
    if isinstance(file, str):
        # A plain string is treated as a path to the uploaded file.
        with open(file, "rb") as handle:
            return handle.read(), file
    path = getattr(file, "name", None)
    if not isinstance(path, str):
        path = None
    data = file.read() if hasattr(file, "read") else file
    return data, path


def _extract_text(file_bytes, mime, path):
    """Extract raw text from ``file_bytes`` using the parser that matches ``mime``."""
    if mime == 'application/pdf':
        # Context manager closes the PyMuPDF document once the text is read.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            return "".join(page.get_text("text") for page in doc)
    if mime in ('text/rtf', 'application/rtf'):
        # mimetypes commonly reports .rtf as application/rtf, so accept both.
        return rtf_to_text(file_bytes.decode("utf-8", errors="ignore"))
    if mime == 'text/plain':
        return file_bytes.decode("utf-8", errors="ignore")
    if mime in ('text/csv', 'application/vnd.ms-excel'):
        return " ".join(pd.read_csv(BytesIO(file_bytes)).astype(str).values.flatten())
    if mime == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        doc = docx.Document(BytesIO(file_bytes))
        return " ".join(p.text for p in doc.paragraphs if p.text)
    if mime in ('image/jpeg', 'image/png'):
        # Open first, then resize: the dimensions must be read from an image
        # that already exists. Target height is 300px, aspect ratio preserved.
        img = Image.open(BytesIO(file_bytes)).convert('L')
        width = max(1, int(img.width * 300 / img.height))
        return pytesseract.image_to_string(img.resize((width, 300)))
    if mime == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
        df = pd.read_excel(BytesIO(file_bytes), engine='openpyxl')
        return " ".join(df.astype(str).values.flatten())
    # Fallback: textract works on a file path (it picks its parser from the
    # extension), not on raw bytes.
    if path is None:
        raise ValueError("unsupported file type and no file path available for textract")
    return textract.process(path).decode("utf-8", errors="ignore")


def _clean_text(text):
    """Strip ``$...$`` math delimiters, collapse whitespace and drop non-ASCII characters."""
    text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
    text = re.sub(r"\\cap", "intersection", text)
    text = re.sub(r"\s+", " ", text).strip()
    return "".join(c for c in text if ord(c) < 128)


def summarize_file(file):
    """Extract the text of an uploaded file and summarize it chunk by chunk.

    Args:
        file: The upload. A file-like object with ``read()`` (and ideally a
            ``name`` path), a path string, or raw bytes.

    Returns:
        A Markdown-formatted string with the character count, elapsed time
        and one summary per 1000-character chunk, or a message starting with
        "❌" when extraction fails or yields no text. Errors are reported in
        the returned string rather than raised.
    """
    max_chars = 300000      # hard cap on the amount of text summarized
    chunk_size = 1000       # characters per chunk fed to the model
    batch_size = 4          # chunks per summarizer call
    time_budget = 15        # seconds before summarization stops early

    start = time.time()
    try:
        file_bytes, path = _read_upload(file)
        print(f"File: {path or 'unknown'}")
        mime = mimetypes.guess_type(path)[0] if path else None
        text = _clean_text(_extract_text(file_bytes, mime, path))
        print(f"Extracted chars: {len(text)}")
    except Exception as e:
        # UI boundary: surface any parser failure as a message, not a crash.
        return f"❌ Text extraction failed: {str(e)}"
    if not text.strip():
        return "❌ No text found"

    text = text[:max_chars]
    # text is non-empty here, so there is always at least one chunk.
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    print(f"Chunks created: {len(chunks)}")

    summaries = []
    for i in range(0, len(chunks), batch_size):
        if time.time() - start > time_budget:
            summaries.append("⚠️ Stopped early")
            break
        batch = chunks[i:i + batch_size]
        # Skip the whole batch when any chunk is more than 70% non-alphanumeric.
        if any(sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.7 for chunk in batch):
            summaries.append(f"**Chunk {i+1}–{i+len(batch)}**: Skipped (equation-heavy)")
            continue
        try:
            results = summarizer(batch, max_length=50, min_length=10, do_sample=False)
            summaries.extend(f"**Chunk {i+j+1}**:\n{r['summary_text']}" for j, r in enumerate(results))
        except Exception as e:
            # One failing batch must not discard the summaries already produced.
            summaries.append(f"**Chunk {i+1}–{i+len(batch)}**: ❌ Error: {str(e)}")
    return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries)
72
+
73
# Gradio UI: a single file upload feeds summarize_file and the returned
# string is shown in a text box.
# NOTE(review): type="file" looks like the Gradio 3.x spelling; newer releases
# appear to accept only "filepath" or "binary" — confirm against the pinned version.
demo = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="📄 Any File", type="file"),
    outputs=gr.Textbox(label="📝 Summary"),
    title="Fast Summarizer",
    description="300,000+ chars in ~10–15s (CPU)",
)
78
+
79
if __name__ == "__main__":
    # Serve the UI on port 7860 without a public share link. A launch failure
    # is reported on stdout instead of propagating as a traceback.
    try:
        demo.launch(server_port=7860, share=False)
    except Exception as launch_error:
        print(f"❌ Gradio launch failed: {launch_error}")