tejovanth committed on
Commit
f738250
Β·
verified Β·
1 Parent(s): 6441138

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -36
app.py CHANGED
@@ -1,82 +1,105 @@
1
  import gradio as gr
2
- from transformers import pipeline
3
  import fitz # PyMuPDF for PDFs
4
- import pytesseract # For OCR (images)
5
- from PIL import Image
6
  import io
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
# Summarization model: DistilBART fine-tuned on CNN/DailyMail.
summarizer = pipeline(task="summarization", model="sshleifer/distilbart-cnn-12-6")
10
-
11
# Function to extract text from different file types
def extract_text(file_bytes):
    """Extract plain text from an uploaded file's raw bytes.

    File type is sniffed from magic numbers: %PDF → PyMuPDF extraction,
    JPEG/PNG → OCR via pytesseract, anything else → UTF-8 decode.
    Failures are reported as "❌ ..." message strings rather than raised,
    so the caller can display them directly.
    """
    try:
        header = file_bytes[:4]

        if header.startswith(b'%PDF'):
            # 'with' guarantees the document is closed even if a page fails
            # mid-extraction (previously it was leaked on error), and join
            # avoids quadratic string concatenation on large PDFs.
            with fitz.open(stream=file_bytes, filetype="pdf") as doc:
                return "".join(page.get_text() for page in doc)

        elif header.startswith(b'\xFF\xD8') or header.startswith(b'\x89PNG'):
            # JPEG/PNG: run OCR on the image.
            image = Image.open(io.BytesIO(file_bytes))
            return pytesseract.image_to_string(image)

        else:
            # Fall back to treating the payload as UTF-8 text.
            try:
                return file_bytes.decode("utf-8")
            except UnicodeDecodeError:
                return "❌ Unsupported file format or corrupted file."

    except Exception as e:
        # Catch-all boundary: surface the problem as a UI-friendly string.
        return f"❌ Error reading file: {str(e)}"
39
 
40
# Function to chunk text into smaller pieces
def chunk_text(text, chunk_size=4000):
    """Break *text* into consecutive chunk_size-character slices.

    The final slice may be shorter; an empty string yields an empty list.
    """
    slices = []
    for start in range(0, len(text), chunk_size):
        slices.append(text[start:start + chunk_size])
    return slices
43
 
44
# Summarize the extracted text
def summarize_file(file_bytes):
    """Run the full pipeline on an upload: extract text, cap its length,
    chunk it, summarize each chunk, and return one Markdown report
    (or an "❌ ..." message when nothing usable was found)."""
    extracted = extract_text(file_bytes)
    if not extracted or not extracted.strip():
        return "❌ No text found in the uploaded file."

    # Cap the workload at 300,000 characters.
    capped = extracted[:300000] if len(extracted) > 300000 else extracted

    # 4,000-character segments keep each summarizer call manageable.
    segments = chunk_text(capped, chunk_size=4000)
    if not segments:
        return "❌ No valid chunks to summarize."

    parts = []
    for idx, segment in enumerate(segments):
        try:
            result = summarizer(segment, max_length=150, min_length=40, do_sample=False)
            parts.append(f"**Chunk {idx+1} Summary**:\n{result[0]['summary_text']}")
        except Exception as e:
            # A failed chunk is reported inline; the rest still get summarized.
            parts.append(f"**Chunk {idx+1} Summary**: ❌ Error summarizing chunk: {str(e)}")

    combined = "\n\n".join(parts)
    return f"**Total Characters Processed**: {len(capped)}\n\n**Summaries**:\n{combined}"
72
 
73
# Gradio UI: binary file in, Markdown-ish summary text out.
_file_input = gr.File(label="πŸ“„ Upload Notes (PDF, TXT, or Handwritten Image)", type="binary")
_summary_output = gr.Textbox(label="πŸ“ Summarized Notes")
demo = gr.Interface(
    fn=summarize_file,
    inputs=_file_input,
    outputs=_summary_output,
    title="πŸ“š Note Summarizer",
    description="Upload academic notes in PDF, TXT, or image format (supports at least 300,000 characters). This app extracts and summarizes the content using a Hugging Face transformer model.",
)
81
 
82
  # Launch the interface
@@ -84,4 +107,3 @@ if __name__ == "__main__":
84
  demo.launch()
85
 
86
 
87
-
 
1
  import gradio as gr
 
2
  import fitz # PyMuPDF for PDFs
 
 
3
  import io
4
+ import torch
5
+ from transformers import pipeline
6
+ from tqdm import tqdm
7
+ import time
8
+
9
# Select the compute device: CUDA ordinal 0 when available, -1 for CPU
# (the transformers pipeline convention). The 5–10s latency target
# assumes a GPU, so warn loudly when falling back to CPU.
device = 0 if torch.cuda.is_available() else -1
if device == -1:
    print("⚠️ Warning: GPU not detected. 5–10s target requires a GPU. Expect slower performance.")

# Summarization pipeline: distilbart-cnn-6-6 is the smaller/faster
# DistilBART checkpoint; half precision on GPU, full float32 on CPU.
_dtype = torch.float16 if device == 0 else torch.float32
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-6-6",
    device=device,
    torch_dtype=_dtype,
)
21
 
22
# Function to extract text from PDFs or text files (skip images for speed)
def extract_text(file_bytes):
    """Extract plain text from an uploaded file's raw bytes.

    Supports PDFs (sniffed via the %PDF magic number, parsed with PyMuPDF)
    and UTF-8 text files. Failures are reported as "❌ ..." message strings
    rather than raised, so the caller can display them directly.
    """
    try:
        header = file_bytes[:4]

        if header.startswith(b'%PDF'):
            # 'with' guarantees the document is closed even if a page fails
            # mid-extraction (previously an exception leaked the handle).
            # The disabled tqdm wrapper was a no-op and has been dropped.
            with fitz.open(stream=file_bytes, filetype="pdf") as doc:
                # TEXTFLAGS_TEXT = plain-text-only extraction (fast path);
                # join avoids quadratic string concatenation on big PDFs.
                return "".join(
                    page.get_text("text", flags=fitz.TEXTFLAGS_TEXT)
                    for page in doc
                )

        # Anything that is not a PDF is treated as UTF-8 text.
        try:
            return file_bytes.decode("utf-8")
        except UnicodeDecodeError:
            return "❌ Unsupported file format (images not supported for speed)."

    except Exception as e:
        # Catch-all boundary: surface the problem as a UI-friendly string.
        return f"❌ Error reading file: {str(e)}"
43
 
44
# Function to chunk text
def chunk_text(text, chunk_size=10000):
    """Split *text* into consecutive slices of at most *chunk_size* characters.

    The final slice may be shorter; an empty string yields an empty list.
    """
    pieces = []
    offset = 0
    total = len(text)
    while offset < total:
        pieces.append(text[offset:offset + chunk_size])
        offset += chunk_size
    return pieces
47
 
48
# Summarize the extracted text
def summarize_file(file_bytes):
    """Summarize an uploaded PDF/TXT file chunk-by-chunk.

    Returns a Markdown string with the processed character count, elapsed
    time, and per-chunk summaries — or an "❌ ..." message on failure.
    A soft 8s time budget stops work early to honor the 5–10s target.
    """
    start_time = time.time()
    text = extract_text(file_bytes)
    if not text or len(text.strip()) == 0:
        return "❌ No text found in the uploaded file."
    # extract_text reports failures as "❌ ..." strings; pass them straight
    # through instead of summarizing the error message itself (old bug).
    if text.startswith("❌"):
        return text

    # Cap at 300,000 characters (optional, can remove for larger inputs)
    if len(text) > 300000:
        text = text[:300000]

    # Chunk into 10,000-character segments (~30 chunks for 300,000 chars)
    chunks = chunk_text(text, chunk_size=10000)
    if not chunks:
        return "❌ No valid chunks to summarize."

    # Summarize with batch processing
    summaries = []
    batch_size = 8 if device == 0 else 2  # Large batch on GPU, small on CPU
    max_chunks = 15  # Limit to ~150,000 chars for 5–10s (adjust as needed)

    for i in range(0, min(len(chunks), max_chunks), batch_size):
        if time.time() - start_time > 8:  # Stop early if nearing 10s
            summaries.append("⚠️ Stopped early to meet 5–10s target. Not all text summarized.")
            break
        batch = chunks[i:i + batch_size]
        try:
            batch_summaries = summarizer(
                batch,
                max_length=100,  # Shorter summaries for speed
                min_length=20,
                do_sample=False,
                truncation=True,
                batch_size=batch_size
            )
            for j, summary in enumerate(batch_summaries):
                summaries.append(f"**Chunk {i+j+1} Summary**:\n{summary['summary_text']}")
        except Exception as e:
            # A failed batch is reported inline; remaining batches still run.
            summaries.append(f"**Chunk {i+1} Summary**: ❌ Error: {str(e)}")

    # Add note if not all chunks processed
    if len(chunks) > max_chunks:
        summaries.append(f"⚠️ Only {max_chunks} of {len(chunks)} chunks processed (~{max_chunks*10000} chars). Full processing may take ~12–15s.")

    combined_summary = "\n\n".join(summaries)
    elapsed_time = time.time() - start_time
    return f"**Total Characters Processed**: {min(len(text), max_chunks*10000)}\n**Time Taken**: {elapsed_time:.2f}s\n\n**Summaries**:\n{combined_summary}"
95
 
96
# Gradio UI: binary file in, Markdown-ish summary text out.
_file_input = gr.File(label="πŸ“„ Upload Notes (PDF or TXT)", type="binary")
_summary_output = gr.Textbox(label="πŸ“ Summarized Notes")
demo = gr.Interface(
    fn=summarize_file,
    inputs=_file_input,
    outputs=_summary_output,
    title="πŸ“š Ultra-Fast Note Summarizer",
    description="Upload academic notes in PDF or TXT format (supports ~300,000 characters). Optimized for 5–10s runtime using a lightweight model and GPU. Images not supported for speed.",
)
104
 
105
  # Launch the interface
 
107
  demo.launch()
108
 
109