chmawia committed on
Commit
e76110f
·
verified ·
1 Parent(s): 7196622

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -42
app.py CHANGED
@@ -2,9 +2,10 @@ import streamlit as st
2
  import fitz # PyMuPDF
3
  import os
4
  import re
 
5
 
6
  # Configure Streamlit page
7
- st.set_page_config(page_title="Extract Key Info", layout="centered")
8
 
9
  # Custom Styling
10
  st.markdown(
@@ -26,14 +27,32 @@ st.markdown(
26
  )
27
 
28
  # Page title
29
- st.markdown("<h1 style='text-align: center;'>📄 Extract Key Points & MCQs</h1>", unsafe_allow_html=True)
30
 
31
  # File uploader
32
  uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
33
 
34
- # Function to extract only key points, MCQs, and important questions
35
- def extract_relevant_info(pdf_path):
 
 
 
36
  doc = fitz.open(pdf_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  key_points = []
38
  mcqs = []
39
  important_questions = []
@@ -43,61 +62,65 @@ def extract_relevant_info(pdf_path):
43
  question_pattern = r"^(What|Which|How|Why|When|Who|Where|Explain|Describe)\b"
44
  bullet_point_pattern = r"^(•|-|\*)\s"
45
 
46
- for page in doc:
47
- text = page.get_text("text")
48
- lines = text.split("\n")
49
 
50
- for line in lines:
51
- line = line.strip()
52
 
53
- # Extract MCQs
54
- if re.match(mcq_pattern, line):
55
- mcqs.append(line)
56
 
57
- # Extract Important Questions
58
- elif re.match(question_pattern, line, re.IGNORECASE):
59
- important_questions.append(line)
60
 
61
- # Extract Key Points (Bullets or Short Sentences)
62
- elif re.match(bullet_point_pattern, line) or (len(line) < 150 and "." in line):
63
- key_points.append(line)
64
 
65
  return key_points, mcqs, important_questions
66
 
67
- # Extract Data Button (Disabled until file is uploaded)
68
  if uploaded_file:
69
- extract_button = st.button("🚀 Extract Important Info", use_container_width=True)
70
-
71
  if extract_button:
72
  with st.spinner("Processing your PDF..."):
73
  temp_path = "temp.pdf"
74
  with open(temp_path, "wb") as f:
75
  f.write(uploaded_file.getbuffer())
76
 
77
- key_points, mcqs, important_questions = extract_relevant_info(temp_path)
78
  os.remove(temp_path)
79
 
80
- # Display extracted data
81
- st.success("✅ Extraction Complete!")
82
-
83
- col1, col2 = st.columns(2)
 
84
 
85
- with col1:
86
- if key_points:
87
- st.subheader("📌 Key Points")
88
- for point in key_points:
89
- st.write(f"- {point}")
90
-
91
- with col2:
92
- if mcqs:
93
- st.subheader("❓ MCQs")
94
- for question in mcqs:
95
- st.write(f"- {question}")
96
-
97
- if important_questions:
98
- st.subheader("❓ Important Questions")
99
- for question in important_questions:
100
- st.write(f"- {question}")
 
 
 
 
 
101
 
102
  else:
103
  st.warning("⚠️ Please upload a PDF file first.")
 
2
  import fitz # PyMuPDF
3
  import os
4
  import re
5
+ from transformers import pipeline
6
 
7
  # Configure Streamlit page
8
+ st.set_page_config(page_title="PDF Extractor", layout="centered")
9
 
10
  # Custom Styling
11
  st.markdown(
 
27
  )
28
 
29
  # Page title
30
+ st.markdown("<h1 style='text-align: center;'>📄 PDF Extractor</h1>", unsafe_allow_html=True)
31
 
32
  # File uploader
33
  uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
34
 
35
+ # Selection: Summarize or Generate MCQs & Key Points
36
+ task = st.radio("Select Task:", ["Summarize PDF", "Generate MCQs, Key Points, and Important Questions"])
37
+
38
+ # Function to extract text from PDF
39
+ def extract_text_from_pdf(pdf_path):
40
  doc = fitz.open(pdf_path)
41
+ text = ""
42
+
43
+ for page in doc:
44
+ text += page.get_text("text") + "\n"
45
+
46
+ return text.strip()
47
+
48
+ # Function to generate a summary using an AI model
49
+ def summarize_text(text):
50
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
51
+ summary = summarizer(text, max_length=200, min_length=50, do_sample=False)
52
+ return summary[0]['summary_text']
53
+
54
+ # Function to extract only key points, MCQs, and important questions
55
+ def extract_relevant_info(text):
56
  key_points = []
57
  mcqs = []
58
  important_questions = []
 
62
  question_pattern = r"^(What|Which|How|Why|When|Who|Where|Explain|Describe)\b"
63
  bullet_point_pattern = r"^(•|-|\*)\s"
64
 
65
+ lines = text.split("\n")
 
 
66
 
67
+ for line in lines:
68
+ line = line.strip()
69
 
70
+ # Extract MCQs
71
+ if re.match(mcq_pattern, line):
72
+ mcqs.append(line)
73
 
74
+ # Extract Important Questions
75
+ elif re.match(question_pattern, line, re.IGNORECASE):
76
+ important_questions.append(line)
77
 
78
+ # Extract Key Points (Bullets or Short Sentences)
79
+ elif re.match(bullet_point_pattern, line) or (len(line) < 150 and "." in line):
80
+ key_points.append(line)
81
 
82
  return key_points, mcqs, important_questions
83
 
84
+ # Extract Data Button
85
  if uploaded_file:
86
+ extract_button = st.button("🚀 Extract Data", use_container_width=True)
87
+
88
  if extract_button:
89
  with st.spinner("Processing your PDF..."):
90
  temp_path = "temp.pdf"
91
  with open(temp_path, "wb") as f:
92
  f.write(uploaded_file.getbuffer())
93
 
94
+ extracted_text = extract_text_from_pdf(temp_path)
95
  os.remove(temp_path)
96
 
97
+ # Perform selected task
98
+ if task == "Summarize PDF":
99
+ st.subheader("📖 Summary")
100
+ summary = summarize_text(extracted_text)
101
+ st.write(summary)
102
 
103
+ elif task == "Generate MCQs, Key Points, and Important Questions":
104
+ key_points, mcqs, important_questions = extract_relevant_info(extracted_text)
105
+
106
+ col1, col2 = st.columns(2)
107
+
108
+ with col1:
109
+ if key_points:
110
+ st.subheader("📌 Key Points")
111
+ for point in key_points:
112
+ st.write(f"- {point}")
113
+
114
+ with col2:
115
+ if mcqs:
116
+ st.subheader("❓ MCQs")
117
+ for question in mcqs:
118
+ st.write(f"- {question}")
119
+
120
+ if important_questions:
121
+ st.subheader("❓ Important Questions")
122
+ for question in important_questions:
123
+ st.write(f"- {question}")
124
 
125
  else:
126
  st.warning("⚠️ Please upload a PDF file first.")