tejovanth committed on
Commit
7116300
·
verified ·
1 Parent(s): 5228f47

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -41
app.py CHANGED
@@ -1,64 +1,68 @@
1
  import gradio as gr
2
  from transformers import pipeline
3
  import fitz # PyMuPDF for PDFs
4
- import pytesseract
5
  from PIL import Image
 
6
 
7
- # Load Hugging Face summarization model
8
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
9
 
 
 
 
 
 
 
 
10
 
11
- # Extract text from PDF
12
- def extract_text_from_pdf(pdf_file):
13
- doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
14
- text = ""
15
- for page in doc:
16
- text += page.get_text()
17
- return text
18
-
19
- # Extract text from TXT file
20
- def extract_text_from_txt(txt_file):
21
- return txt_file.read().decode("utf-8")
22
-
23
- # Extract text from image using OCR
24
- def extract_text_from_image(image_file):
25
- image = Image.open(image_file)
26
- return pytesseract.image_to_string(image)
27
 
28
- # Main function to handle various formats
29
- def summarize_uploaded_file(file):
30
- try:
31
- file_type = file.name.split(".")[-1].lower()
32
 
33
- if file_type == "pdf":
34
- text = extract_text_from_pdf(file)
35
- elif file_type == "txt":
36
- text = extract_text_from_txt(file)
37
- elif file_type in ["jpg", "jpeg", "png"]:
38
- text = extract_text_from_image(file)
39
  else:
40
- return "❌ Unsupported file type. Please upload PDF, TXT, or an image file."
 
 
 
 
41
 
42
- if len(text.strip()) == 0:
43
- return "❌ The file seems empty or has no readable content."
44
 
45
- text = text[:3000] # Truncate for summarization model
46
- summary = summarizer(text, max_length=150, min_length=40, do_sample=False)
47
- return summary[0]['summary_text']
 
 
48
 
49
- except Exception as e:
50
- return f"❌ Error: {str(e)}"
 
 
51
 
52
  # Gradio UI
53
  demo = gr.Interface(
54
- fn=summarize_uploaded_file,
55
- inputs=gr.File(label="πŸ“„ Upload Notes (PDF, TXT, or Handwritten Image)", type="file"),
56
- outputs=gr.Textbox(label="πŸ“ Summary"),
57
- title="🧠 Universal Note Summarizer",
58
- description="Upload academic notes as PDF, text, or an image of handwritten notes. The app extracts and summarizes the content using a Hugging Face model and OCR."
59
  )
60
 
61
  demo.launch()
62
 
63
 
 
 
 
64
 
 
1
  import gradio as gr
2
  from transformers import pipeline
3
  import fitz # PyMuPDF for PDFs
4
+ import pytesseract # For OCR (images)
5
  from PIL import Image
6
+ import io
7
 
8
+ # Load summarization model
9
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
10
 
11
+ # Function to extract text from different file types
12
def extract_text(file_obj):
    """Extract plain text from an uploaded PDF, JPEG/PNG image, or text file.

    The file type is detected from the leading magic bytes rather than from
    a file name, so the content alone decides how it is parsed.

    Args:
        file_obj: Either the raw file content (``bytes`` / ``bytearray``, as
            delivered by a Gradio ``File`` input configured with
            ``type="binary"``) or a binary file-like object exposing
            ``read()``.

    Returns:
        The extracted text. Failures are not raised: a message starting with
        "❌" is returned instead so it can be displayed directly in the UI.
    """
    try:
        if isinstance(file_obj, (bytes, bytearray)):
            # Raw upload content: there is nothing to read() or rewind.
            file_bytes = bytes(file_obj)
        else:
            file_bytes = file_obj.read()
            # Rewind so the caller can read the stream again afterwards.
            if hasattr(file_obj, "seek"):
                file_obj.seek(0)

        # Determine the file type from its magic number.
        if file_bytes.startswith(b"%PDF"):
            # Close the document deterministically once every page is read.
            with fitz.open(stream=file_bytes, filetype="pdf") as doc:
                return "".join(page.get_text() for page in doc)

        if file_bytes.startswith((b"\xFF\xD8", b"\x89PNG")):
            # JPEG or PNG image: run OCR on it.
            image = Image.open(io.BytesIO(file_bytes))
            return pytesseract.image_to_string(image)

        # Anything else is treated as UTF-8 plain text.
        try:
            return file_bytes.decode("utf-8")
        except UnicodeDecodeError:
            return "❌ Unsupported file format or corrupted file."

    except Exception as e:
        # UI boundary: report the problem as text instead of crashing the app.
        return f"❌ Error reading file: {str(e)}"
41
 
42
+ # Summarize the extracted text
43
def summarize_file(file_obj):
    """Summarize the text content of an uploaded file.

    Args:
        file_obj: The uploaded file, passed straight through to
            ``extract_text``.

    Returns:
        The generated summary, or a message starting with "❌" when no text
        could be extracted or summarization failed.
    """
    text = extract_text(file_obj)
    if not text or not text.strip():
        return "❌ No text found in the uploaded file."

    # extract_text reports failures as "❌ ..." strings; show them to the user
    # as-is instead of feeding the error message to the summarizer.
    if text.startswith("❌"):
        return text

    try:
        summary = summarizer(
            text[:3000],  # cap the input size before tokenization
            max_length=150,
            min_length=40,
            do_sample=False,
            truncation=True,  # guard against inputs over the model's token limit
        )
    except Exception as e:
        # UI boundary: report model failures as text instead of a stack trace.
        return f"❌ Error: {str(e)}"

    return summary[0]["summary_text"]
52
 
53
  # Gradio UI
54
# Build the single-input, single-output web UI around summarize_file.
demo = gr.Interface(
    fn=summarize_file,
    # type="binary" hands the upload to fn as raw bytes (see Gradio File docs).
    inputs=gr.File(label="📄 Upload Notes (PDF, TXT, or Handwritten Image)", type="binary"),
    outputs=gr.Textbox(label="📝 Summarized Notes"),
    title="📚 Note Summarizer",
    description="Upload academic notes in PDF, TXT, or image format. This app extracts and summarizes the content using a Hugging Face transformer model.",
)

# Start the Gradio server.
demo.launch()
63
 
64
 
65
+
66
+
67
+
68