tejovanth commited on
Commit
7cd95b1
·
verified ·
1 Parent(s): 78bf1e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -71
app.py CHANGED
@@ -1,87 +1,50 @@
1
  import gradio as gr
 
 
2
  from transformers import pipeline
3
- import fitz # PyMuPDF for PDFs
4
- import pytesseract # For OCR (images)
5
- from PIL import Image
6
- import io
7
 
8
- # Load summarization model
9
- summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
 
10
 
11
- # Function to extract text from different file types
12
- def extract_text(file_bytes):
13
- try:
14
- # file_bytes is already a bytes object
15
- header = file_bytes[:4]
16
-
17
- # Determine file type based on magic numbers
18
- if header.startswith(b'%PDF'):
19
- doc = fitz.open(stream=file_bytes, filetype="pdf")
20
- text = ""
21
- for page in doc:
22
- text += page.get_text()
23
- return text
24
-
25
- elif header.startswith(b'\xFF\xD8') or header.startswith(b'\x89PNG'):
26
- # It's an image (JPEG/PNG), use OCR
27
- image = Image.open(io.BytesIO(file_bytes))
28
- return pytesseract.image_to_string(image)
29
-
30
- else:
31
- # Try reading as plain text
32
- try:
33
- return file_bytes.decode("utf-8")
34
- except UnicodeDecodeError:
35
- return "❌ Unsupported file format or corrupted file."
36
 
 
 
 
 
 
 
37
  except Exception as e:
38
- return f"❌ Error reading file: {str(e)}"
39
-
40
- # Function to chunk text into smaller pieces
41
- def chunk_text(text, chunk_size=4000):
42
- return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
43
-
44
- # Summarize the extracted text
45
- def summarize_file(file_bytes):
46
- text = extract_text(file_bytes)
47
- if not text or len(text.strip()) == 0:
48
- return "❌ No text found in the uploaded file."
49
-
50
- # Ensure at least 300,000 characters can be processed (no truncation)
51
- if len(text) > 300000:
52
- text = text[:300000] # Optional: cap at 300,000 if desired, but can be removed for larger inputs
53
-
54
- # Chunk the text into 4,000-character segments
55
- chunks = chunk_text(text, chunk_size=4000)
56
- if not chunks:
57
- return "❌ No valid chunks to summarize."
58
-
59
- # Summarize each chunk
60
  summaries = []
61
  for i, chunk in enumerate(chunks):
 
 
 
62
  try:
63
- summary = summarizer(chunk, max_length=150, min_length=40, do_sample=False)
64
- summaries.append(f"**Chunk {i+1} Summary**:\n{summary[0]['summary_text']}")
65
  except Exception as e:
66
- summaries.append(f"**Chunk {i+1} Summary**: ❌ Error summarizing chunk: {str(e)}")
 
67
 
68
- # Combine summaries
69
- combined_summary = "\n\n".join(summaries)
70
- total_chars = len(text)
71
- return f"**Total Characters Processed**: {total_chars}\n\n**Summaries**:\n{combined_summary}"
72
 
73
- # Gradio UI
74
- demo = gr.Interface(
75
- fn=summarize_file,
76
- inputs=gr.File(label="πŸ“„ Upload Notes (PDF, TXT, or Handwritten Image)", type="binary"),
77
- outputs=gr.Textbox(label="πŸ“ Summarized Notes"),
78
- title="πŸ“š Note Summarizer",
79
- description="Upload academic notes in PDF, TXT, or image format (supports at least 300,000 characters). This app extracts and summarizes the content using a Hugging Face transformer model."
80
- )
81
-
82
- # Launch the interface
83
  if __name__ == "__main__":
84
- demo.launch()
 
 
 
85
 
86
 
87
 
 
1
  import gradio as gr
2
+ import fitz
3
+ import torch
4
  from transformers import pipeline
5
+ import time, logging
 
 
 
6
 
7
# Suppress library log noise below ERROR (transformers/gradio are chatty at INFO).
logging.basicConfig(level=logging.ERROR)
device = -1  # CPU-only: -1 is the transformers `pipeline` convention for CPU
print("⚠️ CPU-only. Expect ~15–25s for 300,000 chars.")
10
 
11
# Load the summarization model once at startup so every request reuses it.
# t5-small is chosen for speed on CPU; float32 is forced because half
# precision is generally unsupported for these ops on CPU.
try:
    summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
except Exception as e:
    print(f"❌ Model loading failed: {str(e)}")
    # raise SystemExit instead of exit(): exit() is an interactive helper
    # injected by the `site` module and is absent under `python -S` / some
    # embedded interpreters; SystemExit is the reliable way to abort.
    raise SystemExit(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
def summarize_file(file):
    """Summarize an uploaded PDF or plain-text file with the global `summarizer`.

    Parameters
    ----------
    file : bytes | str | object
        Value delivered by the ``gr.File`` input. Depending on the Gradio
        version/config this is raw ``bytes`` (type="binary"), a filesystem
        path ``str`` (the default type="filepath"), or a tempfile-like
        wrapper exposing ``.name``.

    Returns
    -------
    str
        Markdown report with character count, elapsed time, and per-chunk
        summaries — or an error message starting with "❌".
    """
    start = time.time()
    try:
        # Normalize the upload to raw bytes. The previous version only
        # accepted bytes, so the default str filepath from gr.File crashed
        # into the except below on every upload.
        if isinstance(file, bytes):
            file_bytes = file
        elif isinstance(file, str):
            with open(file, "rb") as fh:  # `with` so the handle isn't leaked
                file_bytes = fh.read()
        else:
            # Tempfile-like wrapper — read from its backing path.
            # NOTE(review): assumes the object exposes `.name`; confirm for
            # the deployed Gradio version.
            with open(file.name, "rb") as fh:
                file_bytes = fh.read()

        if file_bytes.startswith(b'%PDF'):
            doc = fitz.open(stream=file_bytes, filetype="pdf")
            # flags=16 carried over from the original code — presumably
            # fitz.TEXT_DEHYPHENATE; confirm against the PyMuPDF docs.
            text = "".join(page.get_text("text", flags=16) for page in doc)
        else:
            # Not a PDF: fall back to treating the upload as UTF-8 text.
            text = file_bytes.decode("utf-8", errors="ignore")
    except Exception as e:
        return f"❌ Text extraction failed: {str(e)}"

    if not text.strip():
        return "❌ No text found"

    # Cap the input and split into 10,000-character chunks for the model.
    text = text[:300000]
    chunks = [text[i:i+10000] for i in range(0, len(text), 10000)]
    # Fixed: original line was a syntax error ("if gamba not chunks: ...").
    if not chunks:
        return "❌ No chunks to summarize"

    summaries = []
    for i, chunk in enumerate(chunks):
        # Hard 9-second budget so the request returns before hosting timeouts.
        if time.time() - start > 9:
            summaries.append("⚠️ Stopped early")
            break
        try:
            summary = summarizer(chunk, max_length=40, min_length=10, do_sample=False)[0]['summary_text']
            summaries.append(f"**Chunk {i+1}**:\n{summary}")
        except Exception as e:
            summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")

    return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries)
40
 
41
# Single-input/single-output Gradio UI around summarize_file.
# Fixed: the label emoji were mojibake ("πŸ“„" is UTF-8 📄 mis-decoded as
# Latin-1); restore the intended characters.
demo = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="📄 PDF/TXT Notes"),
    outputs=gr.Textbox(label="📝 Summary"),
    title="Fast Summarizer",
    description="300,000+ chars in ~15–25s (CPU)",
)
 
 
 
42
 
 
 
 
 
 
 
 
 
 
 
43
# Script entry point: start the Gradio server on the Space's standard port.
if __name__ == "__main__":
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as e:
        # Surface launch failures (e.g. port already bound) as a readable
        # message instead of a bare traceback.
        print(f"❌ Gradio launch failed: {str(e)}")
48
 
49
 
50