tejovanth committed on
Commit
47a8c64
·
verified ·
1 Parent(s): 1ae4c5e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -64
app.py CHANGED
@@ -1,110 +1,114 @@
1
  import gradio as gr
2
- import fitz # PyMuPDF
3
  import torch
4
  from transformers import pipeline
5
- import time, logging
6
- import re
 
 
7
  import tempfile
8
 
# === Setup Logging and Device ===
logging.basicConfig(level=logging.ERROR)
device = -1  # CPU
print("⚠️ CPU-only mode. Expect ~20–30s for large documents.")

# === Load Summarization Model ===
# The app is useless without the model, so a load failure terminates the
# process with a non-zero status after reporting the cause.
try:
    summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
except Exception as e:
    print(f"❌ Model loading failed: {e}")
    # `exit` is a convenience injected by the `site` module and is missing
    # under `python -S` or in frozen builds; SystemExit is always available.
    raise SystemExit(1)
20
 
# === Sentence-based Chunking ===
def smart_chunk(text, max_chunk_len=2000):
    """Group the sentences of *text* into chunks of roughly *max_chunk_len* chars.

    Sentences are split after ``.``, ``!`` or ``?`` followed by spaces and are
    packed greedily: a sentence joins the current chunk while the running
    length (each sentence counted with one trailing space) stays below
    *max_chunk_len*; otherwise the current chunk is closed and a new one
    starts. A single sentence longer than the limit is kept whole as its own
    chunk.

    Returns a list of stripped, non-empty chunk strings (``[]`` for empty or
    whitespace-only input).
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_parts = []
    current_len = 0  # length of the buffered sentences, one space after each

    for sentence in sentences:
        if current_len + len(sentence) >= max_chunk_len:
            chunk = " ".join(current_parts).strip()
            # Skip the flush when nothing is buffered yet (first sentence
            # already over the limit) so no empty chunk is emitted.
            if chunk:
                chunks.append(chunk)
            current_parts, current_len = [], 0
        current_parts.append(sentence)
        current_len += len(sentence) + 1

    chunk = " ".join(current_parts).strip()
    if chunk:
        chunks.append(chunk)
    return chunks
34
-
# === Summarization for One File ===
def summarize_file_bytes(file_bytes, filename):
    """Summarize one uploaded file given as raw bytes.

    Bytes starting with ``%PDF`` are read page by page with PyMuPDF; anything
    else is decoded as UTF-8 with undecodable bytes dropped. The text is capped
    at 300,000 characters, chunked with ``smart_chunk`` and each chunk is
    summarized in turn. The loop stops after 20 seconds of wall-clock time or
    once 15 summary lines have been collected.

    Returns a ``(summary, summary)`` pair on success, or ``(error_message, "")``
    when extraction fails or yields no text.
    """
    started = time.time()

    try:
        looks_like_pdf = file_bytes[:4].startswith(b'%PDF')
        if not looks_like_pdf:
            text = file_bytes.decode("utf-8", errors="ignore")
        else:
            pdf = fitz.open(stream=file_bytes, filetype="pdf")
            text = "".join(page.get_text("text", flags=16) for page in pdf)
    except Exception as err:
        return f"{filename}: ❌ Text extraction failed: {err}", ""

    text = text.strip()
    if not text:
        return f"{filename}: ❌ No text found.", ""

    text = text[:300000]
    pieces = []
    lines_so_far = 0

    for position, chunk in enumerate(smart_chunk(text), start=1):
        # Wall-clock budget for the whole file, extraction included.
        if time.time() - started > 20:
            pieces.append("⚠️ Stopped early due to time limit.")
            break
        try:
            result = summarizer(chunk, max_length=100, min_length=30, do_sample=False)
            summary = result[0]["summary_text"]
            pieces.append(f"**Chunk {position}**:\n{summary.strip()}")
            lines_so_far += summary.count('\n') + 1
            if lines_so_far >= 15:
                break
        except Exception as err:
            # One failing chunk is reported inline; later chunks still run.
            pieces.append(f"**Chunk {position}**: ❌ Error summarizing: {err}")

    elapsed = time.time() - started
    header = f"πŸ“„ **{filename}**\n**Characters**: {len(text)} | **Time**: {elapsed:.2f}s\n\n"
    summary_text = header + "\n\n".join(pieces)
    return summary_text, summary_text
70
 
# === Multiple Files Handler ===
def summarize_multiple_files(file_objs):
    """Summarize every uploaded file and write the combined result to a .txt file.

    *file_objs* may contain raw ``bytes`` items (what a ``gr.File`` component
    configured with ``type="binary"`` is documented to deliver) or the legacy
    ``(file_bytes, file_info)`` pairs where ``file_info['name']`` holds the
    path. Raw bytes carry no filename, so a positional name is synthesized.

    Returns ``(markdown_summary, path_to_txt)``; when nothing was uploaded the
    result is ``("❌ No files uploaded", None)``.
    """
    if not file_objs:
        return "❌ No files uploaded", None
    if isinstance(file_objs, (bytes, bytearray)):
        # A single upload arrives unwrapped; treat it as a one-item list.
        file_objs = [file_objs]

    all_summaries = []
    sections = []

    for index, item in enumerate(file_objs, start=1):
        if isinstance(item, (bytes, bytearray)):
            file_bytes, filename = bytes(item), f"file_{index}"
        else:
            try:
                file_bytes, file_info = item
                filename = file_info['name'].split("/")[-1]
            except (TypeError, ValueError, KeyError) as e:
                # Report the bad item instead of aborting the whole batch.
                all_summaries.append(f"file_{index}: ❌ Unsupported upload: {e}")
                continue
        summary, raw_text = summarize_file_bytes(file_bytes, filename)
        all_summaries.append(summary)
        sections.append(f"\n\n{raw_text}\n" + "="*60 + "\n")

    # Save combined summary to a temp .txt file; delete=False keeps it on disk
    # so the returned path stays valid after this function exits.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as f:
        f.write("".join(sections))
        summary_file_path = f.name

    return "\n\n".join(all_summaries), summary_file_path
88
 
# === Gradio Interface ===
# Components are built first, in the same order as before (upload input,
# summary textbox, download file), then wired into the Interface.
_upload_input = gr.File(
    label="πŸ“„ Upload PDF or TXT files",
    file_types=[".pdf", ".txt"],
    type="binary",
    file_count="multiple",
)
_summary_box = gr.Textbox(label="πŸ“ Summary", lines=30, max_lines=100)
_download_file = gr.File(label="πŸ“₯ Download Summary as .txt")

demo = gr.Interface(
    fn=summarize_multiple_files,
    inputs=_upload_input,
    outputs=[_summary_box, _download_file],
    title="πŸ“š Multi-File Summarizer",
    description="Summarizes multiple PDFs or TXTs into at least 15 lines each. Download final output as .txt. CPU-optimized.",
)


# === Launch App ===
def _launch():
    """Start the Gradio server on port 7860 and report any launch failure."""
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as err:
        print(f"❌ Gradio launch failed: {err}")


if __name__ == "__main__":
    _launch()
107
-
108
 
109
 
110
 
 
1
  import gradio as gr
2
+ import fitz
3
  import torch
4
  from transformers import pipeline
5
+ import time, logging, re, pandas as pd, docx, pytesseract, openpyxl, textract, mimetypes
6
+ from PIL import Image
7
+ from io import BytesIO
8
+ from striprtf.striprtf import rtf_to_text
9
  import tempfile
10
 
 
logging.basicConfig(level=logging.ERROR)
device = -1  # CPU-only
print("⚠️ CPU-only. Expect ~5–9s for 300,000 chars.")

# Load the summarization model once at import time. The app cannot work
# without it, so a failure is reported and the process ends with status 1.
try:
    summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
except Exception as e:
    print(f"❌ Model loading failed: {str(e)}")
    # `exit` is a convenience injected by the `site` module and is missing
    # under `python -S` or in frozen builds; SystemExit is always available.
    raise SystemExit(1)
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def _extract_with_textract(file_bytes, filename):
    """Run textract on *file_bytes* via a temporary file and return the text."""
    import os  # local import: the file's import block does not provide `os`

    # textract.process() takes a file path and picks its parser from the
    # extension, so the bytes are spooled to disk with the original suffix.
    suffix = os.path.splitext(filename)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(file_bytes)
        tmp_path = tmp.name
    try:
        return textract.process(tmp_path).decode("utf-8", errors="ignore")
    finally:
        os.unlink(tmp_path)


def _extract_text(file_bytes, filename):
    """Return the raw text of *file_bytes*, dispatching on the MIME type guessed from *filename*."""
    # guess_type() always returns a 2-tuple; an unknown extension gives
    # (None, None), which falls through to the textract fallback below.
    mime, _ = mimetypes.guess_type(filename)

    if mime == 'application/pdf':
        # Context manager closes the document once the pages are read.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            return "".join(page.get_text("text") for page in doc)
    if mime == 'text/plain':
        return file_bytes.decode("utf-8", errors="ignore")
    if mime in ('text/rtf', 'application/rtf'):
        # Both spellings are accepted because the guessed type for ".rtf"
        # depends on the platform's MIME tables.
        return rtf_to_text(file_bytes.decode("utf-8", errors="ignore"))
    if mime == 'application/vnd.ms-excel' and filename.lower().endswith(".xls"):
        # Legacy binary workbook: it is not CSV, so let pandas pick an Excel engine.
        frame = pd.read_excel(BytesIO(file_bytes))
        return " ".join(frame.astype(str).values.flatten())
    if mime in ('text/csv', 'application/vnd.ms-excel'):
        frame = pd.read_csv(BytesIO(file_bytes))
        return " ".join(frame.astype(str).values.flatten())
    if mime == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        document = docx.Document(BytesIO(file_bytes))
        return " ".join(p.text for p in document.paragraphs if p.text)
    if mime in ('image/jpeg', 'image/png'):
        # Open first, then resize: the size depends on the opened image.
        img = Image.open(BytesIO(file_bytes)).convert('L')
        new_width = max(1, int(img.width * 300 / img.height)) if img.height else img.width
        return pytesseract.image_to_string(img.resize((new_width, 300)))
    if mime == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
        frame = pd.read_excel(BytesIO(file_bytes), engine='openpyxl')
        return " ".join(frame.astype(str).values.flatten())
    return _extract_with_textract(file_bytes, filename)


def _clean_text(text):
    """Normalize whitespace and reduce *text* to printable ASCII for the model."""
    # Collapse whitespace FIRST: the ASCII filter below deletes newlines and
    # tabs outright, which would otherwise glue adjacent words together.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^\x20-\x7E]", "", text)  # Printable ASCII only
    text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)  # unwrap $...$ spans
    text = re.sub(r"\\cap", "intersection", text)
    return re.sub(r"\s+", " ", text).strip()


def summarize_file_bytes(file_bytes, filename):
    """Summarize one uploaded file given as raw bytes.

    The text is extracted according to the file type guessed from *filename*,
    cleaned, capped at 300,000 characters and cut into 1,000-character chunks.
    Up to 12 evenly spaced chunks are summarized; chunks that are more than
    70% non-alphanumeric are skipped, and the list is padded to 12 entries.

    Returns ``(summary, summary)`` on success, or ``(error_message, "")`` when
    the input is not non-empty ``bytes``, extraction raises, or the cleaned
    text is shorter than 100 characters / has fewer than 50 alphanumerics.
    """
    start = time.time()
    if not isinstance(file_bytes, bytes) or len(file_bytes) == 0:
        return f"❌ {filename}: Invalid or empty file", ""

    try:
        text = _clean_text(_extract_text(file_bytes, filename))
    except Exception as e:
        # Any parser/OCR failure is reported per file rather than raised.
        return f"❌ {filename}: Text extraction failed: {str(e)}", ""

    if len(text) < 100 or sum(1 for c in text if c.isalnum()) < 50:
        return f"❌ {filename}: Invalid or too short text", ""
    print(f"Extracted chars for {filename}: {len(text)}")

    text = text[:300000]
    # text has at least 100 characters here, so there is always >= 1 chunk.
    chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
    print(f"Chunks for {filename}: {len(chunks)}")

    if len(chunks) >= 12:
        # Sample 12 chunks spread evenly over the document.
        selected_indices = [i * len(chunks) // 12 for i in range(12)]
    else:
        selected_indices = list(range(len(chunks)))

    summaries = []
    for i in selected_indices:
        chunk = chunks[i]
        if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.7:
            summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
            continue
        try:
            summary = summarizer(chunk, max_length=40, min_length=10, do_sample=False)[0]['summary_text']
            summaries.append(f"**Chunk {i+1}**:\n{summary}")
        except Exception as e:
            summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")

    # Pad short documents so the report always lists 12 entries.
    while len(summaries) < 12:
        summaries.append(f"**Chunk {len(summaries)+1}**: Insufficient content")

    summary_text = f"πŸ“„ **{filename}**\n**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries[:12])
    return summary_text, summary_text
76
 
def _sniff_extension(file_bytes):
    """Guess a filename extension from leading magic bytes (default ``.txt``)."""
    if file_bytes.startswith(b"%PDF"):
        return ".pdf"
    if file_bytes.startswith(b"\x89PNG\r\n\x1a\n"):
        return ".png"
    if file_bytes.startswith(b"\xff\xd8\xff"):
        return ".jpg"
    if file_bytes.startswith(b"{\\rtf"):
        return ".rtf"
    if file_bytes.startswith(b"PK\x03\x04"):
        # Office Open XML files are ZIP archives; member names are stored
        # uncompressed, so the folder prefix distinguishes Word from Excel.
        if b"word/" in file_bytes:
            return ".docx"
        if b"xl/" in file_bytes:
            return ".xlsx"
    return ".txt"


def summarize_multiple_files(*file_objs):
    """Summarize every uploaded file and write the combined result to a .txt file.

    Accepts either one list of uploads or the uploads as separate positional
    arguments. Each upload may be raw ``bytes`` (what a ``gr.File`` component
    configured with ``type="binary"`` is documented to deliver), a filesystem
    path string, or an object exposing ``read()`` and ``name``. Raw bytes carry
    no filename, so one is synthesized from the position and sniffed type,
    which lets ``summarize_file_bytes`` pick the right extractor.

    Returns ``(markdown_summary, path_to_txt)``, or
    ``("❌ No files uploaded", None)`` when nothing was provided.
    """
    if not file_objs or not any(file_objs):
        return "❌ No files uploaded", None

    uploads = file_objs[0] if isinstance(file_objs[0], list) else file_objs
    all_summaries = []
    sections = []

    for index, file in enumerate(uploads, start=1):
        if isinstance(file, (bytes, bytearray)):
            file_bytes = bytes(file)
            filename = f"file_{index}{_sniff_extension(file_bytes)}"
        elif isinstance(file, str):
            filename = file.split("/")[-1]
            try:
                with open(file, "rb") as handle:
                    file_bytes = handle.read()
            except OSError as e:
                all_summaries.append(f"❌ {filename}: Could not read file: {e}")
                continue
        elif hasattr(file, 'read') and hasattr(file, 'name'):
            filename = file.name.split("/")[-1]
            file_bytes = file.read()
        else:
            all_summaries.append("❌ Invalid file: Missing read() or name")
            continue

        summary, raw_text = summarize_file_bytes(file_bytes, filename)
        all_summaries.append(summary)
        sections.append(f"\n\n{raw_text}\n" + "="*60 + "\n")

    # delete=False keeps the file on disk so the returned path stays valid
    # after this function exits.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as f:
        f.write("".join(sections))
        summary_file_path = f.name

    return "\n\n".join(all_summaries), summary_file_path
95
 
 
# Components are built first, in the same order as before (upload input,
# summary textbox, download file), then wired into the Interface.
_upload_input = gr.File(label="πŸ“„ Upload Any File", type="binary", file_count="multiple")
_summary_box = gr.Textbox(label="πŸ“ Summary", lines=15, max_lines=100)
_download_file = gr.File(label="πŸ“₯ Download Summary as .txt")

demo = gr.Interface(
    fn=summarize_multiple_files,
    inputs=_upload_input,
    outputs=[_summary_box, _download_file],
    title="πŸ“š Multi-File Summarizer",
    description="Summarizes any file into exactly 15 lines. Download as .txt. ~5–9s for 300,000 chars (CPU).",
)


def _launch():
    """Start the Gradio server on port 7860 and report any launch failure."""
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as err:
        print(f"❌ Gradio launch failed: {str(err)}")


if __name__ == "__main__":
    _launch()
 
112
 
113
 
114