Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,25 +15,17 @@ def configure_gemini_api(api_key: str):
|
|
| 15 |
genai.configure(api_key=api_key)
|
| 16 |
return "✅ API Key configured successfully!"
|
| 17 |
|
| 18 |
-
def extract_text_from_pdf(
|
| 19 |
"""Extract text from PDF using OCR"""
|
| 20 |
try:
|
| 21 |
-
# Create temporary file
|
| 22 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
|
| 23 |
-
tmp_file.write(pdf_file)
|
| 24 |
-
tmp_path = tmp_file.name
|
| 25 |
-
|
| 26 |
# Convert PDF to images
|
| 27 |
-
pages = convert_from_path(
|
| 28 |
all_text = ""
|
| 29 |
|
| 30 |
for i, page in enumerate(pages):
|
| 31 |
text = pytesseract.image_to_string(page)
|
| 32 |
all_text += text + "\n"
|
| 33 |
|
| 34 |
-
# Clean up temporary file
|
| 35 |
-
os.unlink(tmp_path)
|
| 36 |
-
|
| 37 |
return all_text
|
| 38 |
except Exception as e:
|
| 39 |
return f"Error extracting text: {str(e)}"
|
|
@@ -124,25 +116,25 @@ def generate_mcqs_from_chunk(chunk: str, api_key: str) -> List[List[str]]:
|
|
| 124 |
def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress=gr.Progress()) -> Tuple[str, str]:
|
| 125 |
"""Main function to process PDF and generate MCQs"""
|
| 126 |
if not api_key:
|
| 127 |
-
return "❌ Please provide your Gemini API key",
|
| 128 |
|
| 129 |
if not pdf_file:
|
| 130 |
-
return "❌ Please upload a PDF file",
|
| 131 |
|
| 132 |
try:
|
| 133 |
# Extract text from PDF
|
| 134 |
progress(0.1, desc="Extracting text from PDF...")
|
| 135 |
-
extracted_text = extract_text_from_pdf(pdf_file)
|
| 136 |
|
| 137 |
if extracted_text.startswith("Error"):
|
| 138 |
-
return extracted_text,
|
| 139 |
|
| 140 |
# Chunk the text
|
| 141 |
progress(0.2, desc="Chunking text...")
|
| 142 |
chunks = chunk_text(extracted_text, chunk_size)
|
| 143 |
|
| 144 |
if not chunks:
|
| 145 |
-
return "❌ No text could be extracted from the PDF",
|
| 146 |
|
| 147 |
# Generate MCQs from each chunk
|
| 148 |
all_mcq_data = []
|
|
@@ -160,22 +152,18 @@ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress
|
|
| 160 |
progress(0.95, desc="Creating Excel file...")
|
| 161 |
|
| 162 |
if not all_mcq_data:
|
| 163 |
-
return "❌ No MCQs could be generated from the PDF content",
|
| 164 |
|
| 165 |
# Create DataFrame
|
| 166 |
df = pd.DataFrame(all_mcq_data, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
|
| 167 |
|
| 168 |
-
# Create Excel file
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
df.to_excel(writer, index=False, sheet_name='MCQs')
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
|
| 177 |
-
temp_file.write(output.getvalue())
|
| 178 |
-
temp_file.close()
|
| 179 |
|
| 180 |
progress(1.0, desc="Complete!")
|
| 181 |
|
|
@@ -184,7 +172,7 @@ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress
|
|
| 184 |
return success_message, temp_file.name
|
| 185 |
|
| 186 |
except Exception as e:
|
| 187 |
-
return f"❌ Error processing PDF: {str(e)}",
|
| 188 |
|
| 189 |
# Create Gradio interface
|
| 190 |
def create_interface():
|
|
@@ -251,7 +239,7 @@ def create_interface():
|
|
| 251 |
outputs=[status_output, download_file],
|
| 252 |
show_progress=True
|
| 253 |
).then(
|
| 254 |
-
fn=lambda
|
| 255 |
inputs=[download_file],
|
| 256 |
outputs=[download_file]
|
| 257 |
)
|
|
|
|
| 15 |
genai.configure(api_key=api_key)
|
| 16 |
return "✅ API Key configured successfully!"
|
| 17 |
|
| 18 |
+
def extract_text_from_pdf(pdf_file_path: str) -> str:
|
| 19 |
"""Extract text from PDF using OCR"""
|
| 20 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
# Convert PDF to images
|
| 22 |
+
pages = convert_from_path(pdf_file_path)
|
| 23 |
all_text = ""
|
| 24 |
|
| 25 |
for i, page in enumerate(pages):
|
| 26 |
text = pytesseract.image_to_string(page)
|
| 27 |
all_text += text + "\n"
|
| 28 |
|
|
|
|
|
|
|
|
|
|
| 29 |
return all_text
|
| 30 |
except Exception as e:
|
| 31 |
return f"Error extracting text: {str(e)}"
|
|
|
|
| 116 |
def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress=gr.Progress()) -> Tuple[str, str]:
|
| 117 |
"""Main function to process PDF and generate MCQs"""
|
| 118 |
if not api_key:
|
| 119 |
+
return "❌ Please provide your Gemini API key", None
|
| 120 |
|
| 121 |
if not pdf_file:
|
| 122 |
+
return "❌ Please upload a PDF file", None
|
| 123 |
|
| 124 |
try:
|
| 125 |
# Extract text from PDF
|
| 126 |
progress(0.1, desc="Extracting text from PDF...")
|
| 127 |
+
extracted_text = extract_text_from_pdf(pdf_file.name)
|
| 128 |
|
| 129 |
if extracted_text.startswith("Error"):
|
| 130 |
+
return extracted_text, None
|
| 131 |
|
| 132 |
# Chunk the text
|
| 133 |
progress(0.2, desc="Chunking text...")
|
| 134 |
chunks = chunk_text(extracted_text, chunk_size)
|
| 135 |
|
| 136 |
if not chunks:
|
| 137 |
+
return "❌ No text could be extracted from the PDF", None
|
| 138 |
|
| 139 |
# Generate MCQs from each chunk
|
| 140 |
all_mcq_data = []
|
|
|
|
| 152 |
progress(0.95, desc="Creating Excel file...")
|
| 153 |
|
| 154 |
if not all_mcq_data:
|
| 155 |
+
return "❌ No MCQs could be generated from the PDF content", None
|
| 156 |
|
| 157 |
# Create DataFrame
|
| 158 |
df = pd.DataFrame(all_mcq_data, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
|
| 159 |
|
| 160 |
+
# Create temporary Excel file for download
|
| 161 |
+
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx', mode='wb')
|
| 162 |
+
temp_file.close() # Close to allow pandas to write to it
|
|
|
|
| 163 |
|
| 164 |
+
# Write Excel file
|
| 165 |
+
with pd.ExcelWriter(temp_file.name, engine='openpyxl') as writer:
|
| 166 |
+
df.to_excel(writer, index=False, sheet_name='MCQs')
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
progress(1.0, desc="Complete!")
|
| 169 |
|
|
|
|
| 172 |
return success_message, temp_file.name
|
| 173 |
|
| 174 |
except Exception as e:
|
| 175 |
+
return f"❌ Error processing PDF: {str(e)}", None
|
| 176 |
|
| 177 |
# Create Gradio interface
|
| 178 |
def create_interface():
|
|
|
|
| 239 |
outputs=[status_output, download_file],
|
| 240 |
show_progress=True
|
| 241 |
).then(
|
| 242 |
+
fn=lambda file_path: gr.update(visible=bool(file_path)) if file_path else gr.update(visible=False),
|
| 243 |
inputs=[download_file],
|
| 244 |
outputs=[download_file]
|
| 245 |
)
|