tejovanth committed on
Commit
6f47432
·
verified ·
1 Parent(s): f738250

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -91
app.py CHANGED
@@ -1,109 +1,48 @@
1
  import gradio as gr
2
- import fitz # PyMuPDF for PDFs
3
- import io
4
  import torch
5
  from transformers import pipeline
6
- from tqdm import tqdm
7
- import time
8
 
9
- # Check for GPU (mandatory for 5–10s target)
10
  device = 0 if torch.cuda.is_available() else -1
11
- if device == -1:
12
- print("⚠️ Warning: GPU not detected. 5–10s target requires a GPU. Expect slower performance.")
13
 
14
- # Load summarization model (distilbart-cnn-6-6 is faster)
15
- summarizer = pipeline(
16
- "summarization",
17
- model="sshleifer/distilbart-cnn-6-6",
18
- device=device,
19
- torch_dtype=torch.float16 if device == 0 else torch.float32 # Quantize on GPU
20
- )
21
 
22
- # Function to extract text from PDFs or text files (skip images for speed)
23
  def extract_text(file_bytes):
24
- try:
25
- header = file_bytes[:4]
26
-
27
- if header.startswith(b'%PDF'):
28
- doc = fitz.open(stream=file_bytes, filetype="pdf")
29
- text = ""
30
- for page in tqdm(doc, desc="Extracting PDF pages", disable=True): # Silent progress
31
- text += page.get_text("text", flags=fitz.TEXTFLAGS_TEXT) # Fast text-only extraction
32
- doc.close()
33
- return text
34
-
35
- else:
36
- try:
37
- return file_bytes.decode("utf-8")
38
- except UnicodeDecodeError:
39
- return "❌ Unsupported file format (images not supported for speed)."
40
-
41
- except Exception as e:
42
- return f"❌ Error reading file: {str(e)}"
43
-
44
- # Function to chunk text
45
- def chunk_text(text, chunk_size=10000):
46
- return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
47
-
48
- # Summarize the extracted text
49
- def summarize_file(file_bytes):
50
- start_time = time.time()
51
- text = extract_text(file_bytes)
52
- if not text or len(text.strip()) == 0:
53
- return "❌ No text found in the uploaded file."
54
-
55
- # Cap at 300,000 characters (optional, can remove for larger inputs)
56
- if len(text) > 300000:
57
- text = text[:300000]
58
-
59
- # Chunk into 10,000-character segments (~30 chunks for 300,000 chars)
60
- chunks = chunk_text(text, chunk_size=10000)
61
- if not chunks:
62
- return "❌ No valid chunks to summarize."
63
-
64
- # Summarize with batch processing
65
  summaries = []
66
- batch_size = 8 if device == 0 else 2 # Large batch on GPU, small on CPU
67
- max_chunks = 15 # Limit to ~150,000 chars for 5–10s (adjust as needed)
68
-
69
- for i in range(0, min(len(chunks), max_chunks), batch_size):
70
- if time.time() - start_time > 8: # Stop early if nearing 10s
71
- summaries.append("⚠️ Stopped early to meet 5–10s target. Not all text summarized.")
72
  break
73
- batch = chunks[i:i + batch_size]
74
  try:
75
- batch_summaries = summarizer(
76
- batch,
77
- max_length=100, # Shorter summaries for speed
78
- min_length=20,
79
- do_sample=False,
80
- truncation=True,
81
- batch_size=batch_size
82
- )
83
- for j, summary in enumerate(batch_summaries):
84
- summaries.append(f"**Chunk {i+j+1} Summary**:\n{summary['summary_text']}")
85
- except Exception as e:
86
- summaries.append(f"**Chunk {i+1} Summary**: ❌ Error: {str(e)}")
87
-
88
- # Add note if not all chunks processed
89
- if len(chunks) > max_chunks:
90
- summaries.append(f"⚠️ Only {max_chunks} of {len(chunks)} chunks processed (~{max_chunks*10000} chars). Full processing may take ~12–15s.")
91
-
92
- combined_summary = "\n\n".join(summaries)
93
- elapsed_time = time.time() - start_time
94
- return f"**Total Characters Processed**: {min(len(text), max_chunks*10000)}\n**Time Taken**: {elapsed_time:.2f}s\n\n**Summaries**:\n{combined_summary}"
95
 
96
- # Gradio UI
97
  demo = gr.Interface(
98
- fn=summarize_file,
99
- inputs=gr.File(label="📄 Upload Notes (PDF or TXT)", type="binary"),
100
- outputs=gr.Textbox(label="📝 Summarized Notes"),
101
- title="📚 Ultra-Fast Note Summarizer",
102
- description="Upload academic notes in PDF or TXT format (supports ~300,000 characters). Optimized for 5–10s runtime using a lightweight model and GPU. Images not supported for speed."
103
  )
104
 
105
- # Launch the interface
106
  if __name__ == "__main__":
107
- demo.launch()
108
 
109
 
 
1
  import gradio as gr
2
+ import fitz
 
3
  import torch
4
  from transformers import pipeline
5
+ import time, io
 
6
 
 
7
  device = 0 if torch.cuda.is_available() else -1
8
+ if device == -1: raise RuntimeError("GPU required for 5–10s target")
 
9
 
10
+ summarizer = pipeline("summarization", model="google/pegasus-xsum", device=device, torch_dtype=torch.int8)
 
 
 
 
 
 
11
 
 
12
  def extract_text(file_bytes):
13
+ if file_bytes[:4].startswith(b'%PDF'):
14
+ doc = fitz.open(stream=file_bytes, filetype="pdf")
15
+ text = "".join(page.get_text("text", flags=16) for page in doc)
16
+ doc.close()
17
+ return text
18
+ try: return file_bytes.decode("utf-8")
19
+ except: return "❌ Unsupported format (PDF/TXT only)"
20
+
21
+ async def summarize_file(file_bytes):
22
+ start = time.time()
23
+ text = extract_text(file_bytes)[:300000] or "❌ No text found"
24
+ if len(text.strip()) == 0: return text
25
+ chunks = [text[i:i+15000] for i in range(0, len(text), 15000)]
26
+ if not chunks: return "❌ No chunks to summarize"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  summaries = []
28
+ for i in range(0, len(chunks), 10):
29
+ if time.time() - start > 7:
30
+ summaries.append("⚠️ Stopped early")
 
 
 
31
  break
32
+ batch = chunks[i:i+10]
33
  try:
34
+ batch_summaries = summarizer(batch, max_length=40, min_length=10, do_sample=False, batch_size=10)
35
+ summaries.extend(f"**Chunk {i+j+1}**:\n{s['summary_text']}" for j, s in enumerate(batch_summaries))
36
+ except: summaries.append(f"**Chunk {i+1}**: ❌ Error")
37
+ return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
 
39
  demo = gr.Interface(
40
+ fn=summarize_file, inputs=gr.File(label="📄 PDF/TXT Notes"),
41
+ outputs=gr.Textbox(label="📝 Summary"),
42
+ title="Fast Summarizer", description="300,000+ chars in ~5s (GPU)"
 
 
43
  )
44
 
 
45
  if __name__ == "__main__":
46
+ demo.launch(share=False)
47
 
48