tejovanth commited on
Commit
840cf52
·
verified ·
1 Parent(s): 9a7487f

Upload app (6).py

Browse files
Files changed (1) hide show
  1. app (6).py +87 -0
app (6).py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ import fitz # PyMuPDF for PDFs
4
+ import pytesseract # For OCR (images)
5
+ from PIL import Image
6
+ import io
7
+
8
+ # Load summarization model
9
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
10
+
11
# Function to extract text from different file types
def extract_text(file_bytes):
    """Extract plain text from raw uploaded-file bytes.

    The file type is sniffed from magic numbers in the first four bytes:
    ``%PDF`` -> PDF (PyMuPDF text extraction), ``FF D8`` / ``89 'PNG'`` ->
    JPEG/PNG (OCR via pytesseract). Anything else is tried as UTF-8 text.

    Args:
        file_bytes: Raw bytes of the uploaded file.

    Returns:
        The extracted text, or an error-message string starting with '❌'
        when the format is unsupported or reading fails.
    """
    try:
        # file_bytes is already a bytes object
        header = file_bytes[:4]

        # Determine file type based on magic numbers
        if header.startswith(b'%PDF'):
            # Context manager guarantees the document is closed even on
            # error (the original code leaked the open document).
            with fitz.open(stream=file_bytes, filetype="pdf") as doc:
                # join is linear; repeated += concatenation is quadratic
                return "".join(page.get_text() for page in doc)

        elif header.startswith(b'\xFF\xD8') or header.startswith(b'\x89PNG'):
            # It's an image (JPEG/PNG), use OCR
            image = Image.open(io.BytesIO(file_bytes))
            return pytesseract.image_to_string(image)

        else:
            # Try reading as plain text
            try:
                return file_bytes.decode("utf-8")
            except UnicodeDecodeError:
                return "❌ Unsupported file format or corrupted file."

    except Exception as e:
        # Top-level guard: report the failure to the UI instead of
        # crashing the Gradio request handler.
        return f"❌ Error reading file: {str(e)}"
39
+
40
# Function to chunk text into smaller pieces
def chunk_text(text, chunk_size=4000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    Returns an empty list for empty input; the final piece may be shorter
    than *chunk_size*.
    """
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces
43
+
44
# Summarize the extracted text
def summarize_file(file_bytes):
    """Extract text from an uploaded file and summarize it chunk by chunk.

    Returns a markdown string with the processed character count followed
    by one summary per 4,000-character chunk, or an '❌' message when no
    usable text is found.
    """
    text = extract_text(file_bytes)
    if not text or len(text.strip()) == 0:
        return "❌ No text found in the uploaded file."

    # Cap the input at 300,000 characters to bound processing time.
    capped = text[:300000] if len(text) > 300000 else text

    # Split into 4,000-character segments sized for the model.
    segments = chunk_text(capped, chunk_size=4000)
    if not segments:
        return "❌ No valid chunks to summarize."

    # Summarize each segment independently; a failure on one segment is
    # reported inline rather than aborting the whole run.
    parts = []
    for idx, segment in enumerate(segments, start=1):
        try:
            result = summarizer(segment, max_length=150, min_length=40, do_sample=False)
            parts.append(f"**Chunk {idx} Summary**:\n{result[0]['summary_text']}")
        except Exception as e:
            parts.append(f"**Chunk {idx} Summary**: ❌ Error summarizing chunk: {str(e)}")

    # Combine summaries
    combined = "\n\n".join(parts)
    return f"**Total Characters Processed**: {len(capped)}\n\n**Summaries**:\n{combined}"
72
+
73
+ # Gradio UI
74
+ demo = gr.Interface(
75
+ fn=summarize_file,
76
+ inputs=gr.File(label="πŸ“„ Upload Notes (PDF, TXT, or Handwritten Image)", type="binary"),
77
+ outputs=gr.Textbox(label="πŸ“ Summarized Notes"),
78
+ title="πŸ“š Note Summarizer",
79
+ description="Upload academic notes in PDF, TXT, or image format (supports at least 300,000 characters). This app extracts and summarizes the content using a Hugging Face transformer model."
80
+ )
81
+
82
+ # Launch the interface
83
+ if __name__ == "__main__":
84
+ demo.launch()
85
+
86
+
87
+