tejovanth committed on
Commit
53883a6
·
verified ·
1 Parent(s): 8549f68

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -94
app.py DELETED
@@ -1,94 +0,0 @@
1
- import gradio as gr
2
- import fitz
3
- import torch
4
- from transformers import pipeline
5
- import time, logging, re, pandas as pd, docx, pytesseract, openpyxl, textract, mimetypes
6
- from PIL import Image
7
- from io import BytesIO
8
- from striprtf.striprtf import rtf_to_text
9
-
# Only library errors are logged; progress and status are reported via print().
logging.basicConfig(level=logging.ERROR)

device = -1  # CPU-only
print("⚠️ CPU-only. Expect ~10–15s for 300,000 chars.")

# Build the summarization pipeline once at import time so that every call to
# summarize_file() reuses the same loaded model.
try:
    summarizer = pipeline(
        "summarization",
        model="t5-small",
        device=device,
        torch_dtype=torch.float32,
    )
except Exception as e:
    print(f"❌ Model loading failed: {e}")
    # Raise SystemExit directly: the bare `exit()` helper is installed by the
    # `site` module for interactive sessions and is not guaranteed to exist
    # in every interpreter configuration.
    raise SystemExit(1)

# Documents longer than this many characters are truncated before chunking.
_MAX_CHARS = 300000
# Number of characters handed to the summarizer per chunk.
_CHUNK_SIZE = 1000
# The report always contains exactly this many chunk entries.
_NUM_SUMMARIES = 12

_DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
_XLSX_MIME = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'


def _guess_mime(name, data):
    """Return a MIME type for an upload, from its name or else its leading bytes.

    Falls back to 'text/plain' when neither the file name nor the content
    identifies the format.
    """
    # guess_type() always returns a 2-tuple, so test the first element rather
    # than the tuple itself (a tuple of Nones is still truthy).
    mime = mimetypes.guess_type(name)[0] if name else None
    if mime:
        return mime
    if data.startswith(b"%PDF"):
        return 'application/pdf'
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return 'image/png'
    if data.startswith(b"\xff\xd8\xff"):
        return 'image/jpeg'
    if data.startswith(b"{\\rtf"):
        return 'text/rtf'
    if data.startswith(b"PK"):
        # Office Open XML files are zip archives; member names are stored
        # uncompressed, so the main part's path identifies the flavour.
        if b"word/document.xml" in data:
            return _DOCX_MIME
        if b"xl/workbook.xml" in data:
            return _XLSX_MIME
    return 'text/plain'


def _textract_bytes(name, data):
    """Run textract on in-memory bytes by spilling them to a temporary file."""
    import os
    import tempfile

    # textract.process() takes a path and picks its parser from the file
    # extension, so the temporary file keeps the original suffix.
    suffix = os.path.splitext(name)[1] if isinstance(name, str) else ""
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(data)
        tmp_path = tmp.name
    try:
        return textract.process(tmp_path).decode("utf-8", errors="ignore")
    finally:
        os.unlink(tmp_path)


def summarize_file(file):
    """Extract text from an uploaded file and summarize up to 12 sampled chunks.

    Args:
        file: One of
            * a file-like object exposing ``read()`` and ``name``,
            * raw ``bytes``/``bytearray`` content (type is sniffed from the
              leading bytes because no name is available), or
            * a ``str`` path to a file on disk.

    Returns:
        A Markdown-formatted string with the character count, elapsed time and
        twelve ``**Chunk N**`` entries, or a string starting with "❌"
        describing why the input could not be processed. Errors are reported
        through the return value; extraction failures are not raised.
    """
    start = time.time()

    is_raw = isinstance(file, (bytes, bytearray))
    is_path = isinstance(file, str)
    if is_raw:
        name = None
    elif is_path:
        name = file
    elif hasattr(file, 'read') and hasattr(file, 'name'):
        name = file.name
    else:
        return "❌ Invalid file: Missing read() or name attribute"
    print(f"File: {name if name is not None else '<raw bytes>'}")

    try:
        if is_raw:
            file_bytes = bytes(file)
        elif is_path:
            with open(file, "rb") as fh:
                file_bytes = fh.read()
        else:
            file_bytes = file.read()
        if not isinstance(file_bytes, bytes) or len(file_bytes) == 0:
            return "❌ Invalid file: Empty or non-binary content"

        mime = _guess_mime(name, file_bytes)
        text = ""
        if mime == 'application/pdf':
            try:
                pdf = fitz.open(stream=file_bytes, filetype="pdf")
                try:
                    text = "".join(page.get_text("text") for page in pdf)
                finally:
                    pdf.close()
            except Exception:
                return "❌ PDF parsing failed"
        elif mime in ('text/rtf', 'application/rtf'):
            # The standard mimetypes table reports .rtf as application/rtf.
            text = rtf_to_text(file_bytes.decode("utf-8", errors="ignore"))
        elif mime == 'text/plain':
            text = file_bytes.decode("utf-8", errors="ignore")
        elif mime in ('text/csv', 'application/vnd.ms-excel'):
            # NOTE(review): application/vnd.ms-excel is parsed as CSV here,
            # as in the original; a genuine binary .xls will fail to parse.
            text = " ".join(pd.read_csv(BytesIO(file_bytes)).astype(str).values.flatten())
        elif mime == _DOCX_MIME:
            document = docx.Document(BytesIO(file_bytes))
            text = " ".join(p.text for p in document.paragraphs if p.text)
        elif mime in ('image/jpeg', 'image/png'):
            # Open first, then scale to a height of 300 px keeping the aspect
            # ratio; the size cannot be computed before the image exists.
            img = Image.open(BytesIO(file_bytes)).convert('L')
            new_width = max(1, int(img.width * 300 / img.height))
            img = img.resize((new_width, 300))
            text = pytesseract.image_to_string(img)
        elif mime == _XLSX_MIME:
            df = pd.read_excel(BytesIO(file_bytes), engine='openpyxl')
            text = " ".join(df.astype(str).values.flatten())
        else:
            text = _textract_bytes(name, file_bytes)

        # Strict text cleaning. Whitespace is normalised to single spaces
        # *before* non-printable characters are dropped; otherwise newlines
        # and tabs would be deleted and adjacent words glued together.
        text = re.sub(r"\s+", " ", text)
        text = re.sub(r"[^\x20-\x7E]", "", text)  # Keep printable ASCII only
        text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)  # Unwrap $...$ spans
        text = re.sub(r"\\cap", "intersection", text)
        text = re.sub(r"\s+", " ", text).strip()
        if not text or len(text) < 100 or sum(1 for c in text if c.isalnum()) < 50:
            return "❌ Extracted text invalid or too short"
        print(f"Extracted chars: {len(text)}")
    except Exception as e:
        return f"❌ Text extraction failed: {e}"

    text = text[:_MAX_CHARS]
    chunks = [text[i:i + _CHUNK_SIZE] for i in range(0, len(text), _CHUNK_SIZE)]
    print(f"Chunks created: {len(chunks)}")
    if not chunks:
        return "❌ No chunks to summarize"

    # Select 12 chunks evenly spaced, or every chunk when there are fewer.
    if len(chunks) >= _NUM_SUMMARIES:
        selected_indices = [int(i * len(chunks) / _NUM_SUMMARIES) for i in range(_NUM_SUMMARIES)]
    else:
        selected_indices = list(range(len(chunks)))

    summaries = []
    for i in selected_indices:
        chunk = chunks[i]
        # Chunks that are mostly punctuation/whitespace are not worth
        # sending to the model.
        if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.7:
            summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
            continue
        try:
            summary = summarizer(chunk, max_length=40, min_length=10, do_sample=False)[0]['summary_text']
            summaries.append(f"**Chunk {i+1}**:\n{summary}")
        except Exception as e:
            summaries.append(f"**Chunk {i+1}**: ❌ Error: {e}")

    # Pad to 12 summaries so the report always has the same number of entries.
    while len(summaries) < _NUM_SUMMARIES:
        summaries.append(f"**Chunk {len(summaries)+1}**: Insufficient content for full summary")
    return (
        f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n"
        + "\n\n".join(summaries[:_NUM_SUMMARIES])
    )

# Gradio UI: a single file upload in, a single text box with the summary out.
# NOTE(review): with type="binary" Gradio is documented to hand the callback
# the raw file content as bytes — confirm summarize_file accepts that form.
demo = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="📄 Any File", type="binary"),
    outputs=gr.Textbox(label="📝 Summary"),
    title="Fast Summarizer",
    description="300,000+ chars in ~10–15s (CPU)",
)

if __name__ == "__main__":
    # Report launch failures as a readable message rather than a traceback.
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as e:
        print(f"❌ Gradio launch failed: {e}")