# DocQAgen / app.py
# Ankit Singh
# Added application with all its file
# 085eaee
import gradio as gr
import json
import os
import tempfile
from groq import Groq
from components.process_pdf import extract_text_from_pdf
from components.chunking import create_chunks
from components.llm import create_qa_pairs
# Validate the API key
def validate_api_key(api_key):
    """Check whether ``api_key`` is accepted by the Groq chat-completions API.

    A tiny chat completion (``max_tokens=5``) is requested with the key; the
    key counts as valid when the call returns at least one choice.

    Args:
        api_key: The Groq API key to test.

    Returns:
        A ``(is_valid, message)`` tuple where ``is_valid`` is a bool and
        ``message`` is a human-readable explanation suitable for display.
        This function does not raise: every failure is reported through the
        returned tuple.
    """
    # Reject missing or blank keys up front: there is nothing to validate, so
    # skip the network round trip entirely.
    # NOTE(review): the Groq client is documented to fall back to the
    # GROQ_API_KEY environment variable when api_key is None -- confirm.
    # Rejecting None here keeps a server-side key from validating an empty
    # login.
    if api_key is None or not str(api_key).strip():
        return False, "Please provide a valid API key"

    try:
        client = Groq(api_key=api_key)
        response = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "Validation test for API key."},
                {"role": "user", "content": "Hello, world!"},
            ],
            temperature=0.1,
            max_tokens=5,
        )
        if response and response.choices:
            return True, "API key is valid!"
        return False, "API key validation failed: Empty response."
    except Exception as e:
        # Any failure while building the client or making the request is
        # surfaced to the caller as an invalid key, with the error text.
        return False, f"Invalid API key. Error: {str(e)}"
def create_interface():
    """Build the Gradio Blocks app: an API-key login gate plus the PDF-to-QA tool.

    The login section validates the key with ``validate_api_key`` and, on
    success, stores it in a ``gr.State`` and reveals the main section. The
    main section turns an uploaded PDF into a QA dataset shown as a JSON
    preview and offered as a downloadable ``.json`` file.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """

    def download_update(path=None):
        """Build the update for the download widget; it is shown only when a file exists."""
        # The widget is created hidden, so it must be made visible explicitly
        # once there is something to download; it is hidden again on errors so
        # a stale file from a previous run is not offered.
        return gr.update(value=path, visible=path is not None)

    def process_pdf(pdf_file, chunk_size, context_size, api_key):
        """Generate a QA dataset from ``pdf_file``.

        Args:
            pdf_file: Value of the PDF upload widget (``None`` if nothing was uploaded).
            chunk_size: Value of the "Text Chunk Size" slider.
            context_size: Value of the "LLM Context Window Size" slider.
            api_key: API key kept in the session state by ``on_login``.

        Returns:
            A ``(preview, download)`` pair: ``preview`` is the dataset dict, or
            an ``{"error": ...}`` dict on failure; ``download`` is an update
            for the download widget.
        """
        if not api_key:
            return {"error": "Please provide a valid API key"}, download_update()
        if pdf_file is None:
            return {"error": "Please upload a PDF file"}, download_update()

        try:
            pages = extract_text_from_pdf(pdf_file)
            if not pages:
                return {"error": "Could not extract text from PDF"}, download_update()

            chunks = create_chunks(pages, chunk_size)
            qa_pairs = create_qa_pairs(chunks, api_key, context_size)
            if not qa_pairs:
                return {"error": "No QA pairs could be generated"}, download_update()

            dataset = {
                "metadata": {
                    "total_pairs": len(qa_pairs),
                    "total_pages": len(pages),
                    "chunk_size": chunk_size,
                    "context_window": context_size,
                },
                "qa_pairs": qa_pairs,
            }

            # Serialise before creating the file so that a non-serialisable
            # dataset does not leave an empty temporary file behind.
            payload = json.dumps(dataset, indent=2)

            # delete=False: the file has to outlive this handler so that it
            # can be served through the download widget.
            with tempfile.NamedTemporaryFile(
                mode="w", delete=False, suffix=".json", encoding="utf-8"
            ) as tmp_file:
                tmp_file.write(payload)
                tmp_file_path = tmp_file.name

            return dataset, download_update(tmp_file_path)
        except Exception as e:
            # UI boundary: report any failure in the preview instead of raising.
            return {"error": f"Processing failed: {str(e)}"}, download_update()

    def on_login(api_key):
        """Validate the entered key and switch from the login view to the main view.

        Returns:
            A 4-tuple matching the outputs of the login click handler: the key
            to store in the session state (``""`` on failure), updates for the
            login and main sections, and an update for the feedback message.
        """
        # Pasted keys often carry stray whitespace; validate and store the
        # cleaned value so later requests use exactly the key that was checked.
        api_key = (api_key or "").strip()
        is_valid, message = validate_api_key(api_key)
        if is_valid:
            return (
                api_key,
                gr.update(visible=False),
                gr.update(visible=True),
                gr.update(value=f"**Success!** {message}", visible=True),
            )
        return (
            "",
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(value=f"**Error!** {message}", visible=True),
        )

    # NOTE(review): the original indentation of this layout was lost; the
    # nesting below is reconstructed from the section comments -- confirm.
    with gr.Blocks() as app:
        # Holds the validated API key for the session (None until login).
        state = gr.State()

        # Login Section
        with gr.Column(visible=True) as login_section:
            gr.Markdown("# Login to Use the PDF to QA Dataset Creator")
            api_key_input = gr.Textbox(label="Enter your API Key", type="password")
            login_button = gr.Button("Login")
            login_feedback = gr.Markdown(visible=False)

        # Main Functionality Section
        with gr.Column(visible=False) as main_section:
            gr.Markdown("# Enhanced PDF to QA Dataset Creator")
            with gr.Row():
                pdf_input = gr.File(label="Upload PDF")
            with gr.Row():
                chunk_size = gr.Slider(
                    minimum=500,
                    maximum=2000,
                    value=1000,
                    step=100,
                    label="Text Chunk Size",
                )
                context_size = gr.Slider(
                    minimum=1024,
                    maximum=4096,
                    value=2048,
                    step=256,
                    label="LLM Context Window Size",
                )
            create_button = gr.Button("Create Dataset")
            preview = gr.JSON(label="Dataset Preview")
            # Starts hidden; process_pdf reveals it once a dataset file exists.
            download_btn = gr.File(
                label="Download Dataset",
                file_count="single",
                type="filepath",
                visible=False,
            )
            gr.Markdown(
                "Features:\n"
                "- Generates diverse question types (factual, conceptual, analytical)\n"
                "- Maintains page and chunk context for better coherence\n"
                "- Configurable chunk size and context window\n"
                "- Complete dataset available for download"
            )

        create_button.click(
            fn=process_pdf,
            inputs=[pdf_input, chunk_size, context_size, state],
            outputs=[preview, download_btn],
        )

        # Linking Login and Main Section
        login_button.click(
            on_login,
            inputs=api_key_input,
            outputs=[state, login_section, main_section, login_feedback],
        )

    return app
if __name__ == "__main__":
    # Script entry point: build the interface and start the Gradio server.
    create_interface().launch()