# Spaces: Sleeping
import json
import os
import tempfile

import gradio as gr
from groq import Groq

from components.chunking import create_chunks
from components.llm import create_qa_pairs
from components.process_pdf import extract_text_from_pdf
def validate_api_key(api_key, model="llama3-70b-8192"):
    """Check a Groq API key by issuing a tiny chat-completion request.

    Args:
        api_key: The key entered by the user. Blank, whitespace-only or
            non-string values are rejected without contacting the API.
        model: Model id used for the probe request. Defaults to the id the
            app has always used.
            NOTE(review): confirm this model id is still served by Groq.

    Returns:
        A ``(is_valid, message)`` tuple; ``message`` is suitable for display.
    """
    # Reject blank input up front: there is nothing to validate, and sending
    # it would only cost a network round trip.
    # NOTE(review): a ``None`` key may make the SDK fall back to an
    # environment variable instead of failing - confirm against the SDK docs.
    if not isinstance(api_key, str) or not api_key.strip():
        return False, "Invalid API key. Error: API key is empty."

    try:
        client = Groq(api_key=api_key)
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Validation test for API key."},
                {"role": "user", "content": "Hello, world!"},
            ],
            temperature=0.1,
            max_tokens=5,  # keep the probe as cheap as possible
        )
    except Exception as e:
        # UI boundary: any failure (auth, network, SDK) is reported to the
        # user instead of crashing the login handler.
        return False, f"Invalid API key. Error: {str(e)}"

    if response and response.choices:
        return True, "API key is valid!"
    return False, "API key validation failed: Empty response."
def create_interface() -> "gr.Blocks":
    """Build the Gradio UI: an API-key login gate followed by the PDF-to-QA tool.

    The login column is shown first. Once ``validate_api_key`` accepts the
    key, the key is stored in a ``gr.State`` and the main column (PDF upload,
    sliders, preview, download) is revealed.

    Returns:
        The assembled ``gr.Blocks`` application (not launched).
    """

    def hidden_download():
        """Return an update that clears and hides the download widget."""
        return gr.update(value=None, visible=False)

    def process_pdf(pdf_file, chunk_size, context_size, api_key):
        """Turn an uploaded PDF into a QA dataset.

        Returns:
            A ``(preview, download_update)`` pair. ``preview`` is the dataset
            dict, or ``{"error": ...}`` on failure. ``download_update`` shows
            the generated JSON file on success and hides the widget on error.
        """
        if not api_key:
            return {"error": "Please provide a valid API key"}, hidden_download()
        if pdf_file is None:
            return {"error": "Please upload a PDF file"}, hidden_download()

        try:
            pages = extract_text_from_pdf(pdf_file)
            if not pages:
                return {"error": "Could not extract text from PDF"}, hidden_download()

            chunks = create_chunks(pages, chunk_size)
            qa_pairs = create_qa_pairs(chunks, api_key, context_size)
            if not qa_pairs:
                return {"error": "No QA pairs could be generated"}, hidden_download()

            dataset = {
                "metadata": {
                    "total_pairs": len(qa_pairs),
                    "total_pages": len(pages),
                    "chunk_size": chunk_size,
                    "context_window": context_size,
                },
                "qa_pairs": qa_pairs,
            }

            # delete=False: the file must outlive this handler so Gradio can
            # serve it for download.
            with tempfile.NamedTemporaryFile(
                mode="w", delete=False, suffix=".json", encoding="utf-8"
            ) as tmp_file:
                json.dump(dataset, tmp_file, indent=2)
                tmp_file_path = tmp_file.name

            # The download widget starts hidden, so it has to be made visible
            # explicitly; returning only the path would leave it invisible.
            return dataset, gr.update(value=tmp_file_path, visible=True)
        except Exception as e:
            # UI boundary: report the failure in the preview pane.
            return {"error": f"Processing failed: {str(e)}"}, hidden_download()

    def on_login(api_key):
        """Validate the key and switch between the login and main sections.

        Returns updates for, in order: stored key, login section, main
        section, login feedback.
        """
        is_valid, message = validate_api_key(api_key)
        if is_valid:
            return (
                api_key,
                gr.update(visible=False),
                gr.update(visible=True),
                gr.update(value=f"**Success!** {message}"),
            )
        return (
            "",
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(value=f"**Error!** {message}", visible=True),
        )

    with gr.Blocks() as app:
        # Holds the validated API key for the session.
        state = gr.State()

        # Login Section
        with gr.Column(visible=True) as login_section:
            gr.Markdown("# Login to Use the PDF to QA Dataset Creator")
            api_key_input = gr.Textbox(label="Enter your API Key", type="password")
            login_button = gr.Button("Login")
            login_feedback = gr.Markdown(visible=False)

        # Main Functionality Section (revealed after a successful login)
        with gr.Column(visible=False) as main_section:
            gr.Markdown("# Enhanced PDF to QA Dataset Creator")
            with gr.Row():
                pdf_input = gr.File(label="Upload PDF")
            with gr.Row():
                chunk_size = gr.Slider(
                    minimum=500,
                    maximum=2000,
                    value=1000,
                    step=100,
                    label="Text Chunk Size",
                )
                context_size = gr.Slider(
                    minimum=1024,
                    maximum=4096,
                    value=2048,
                    step=256,
                    label="LLM Context Window Size",
                )
            create_button = gr.Button("Create Dataset")
            preview = gr.JSON(label="Dataset Preview")
            # Hidden until process_pdf produces a file to download.
            download_btn = gr.File(
                label="Download Dataset",
                file_count="single",
                type="filepath",
                visible=False,
            )
            gr.Markdown(
                "Features:\n"
                "- Generates diverse question types (factual, conceptual, analytical)\n"
                "- Maintains page and chunk context for better coherence\n"
                "- Configurable chunk size and context window\n"
                "- Complete dataset available for download"
            )

        create_button.click(
            fn=process_pdf,
            inputs=[pdf_input, chunk_size, context_size, state],
            outputs=[preview, download_btn],
        )

        # Linking Login and Main Section
        login_button.click(
            on_login,
            inputs=api_key_input,
            outputs=[state, login_section, main_section, login_feedback],
        )

    return app
if __name__ == "__main__":
    # Script entry point: build the interface, then start the Gradio server.
    app = create_interface()
    app.launch()