# Interview Q&A Assistant — Gradio app for Hugging Face Spaces.
# Upload an interview-questions PDF, then ask free-text questions against it.
import os
import re
import tempfile

import faiss
import gradio as gr
import numpy as np
import PyPDF2
from sentence_transformers import SentenceTransformer
def extract_text_from_pdf(pdf_file):
    """Extract all text from an uploaded PDF.

    Args:
        pdf_file: Either a filesystem path (str) or a file-like object with
            a ``.name`` attribute — Gradio's ``gr.File`` supplies one or the
            other depending on version.

    Returns:
        The concatenated text of every page (newline-separated), a help
        message when no file was given, or an "Error processing PDF: …"
        string when reading fails.
    """
    if pdf_file is None:
        return "Please upload a PDF file."
    # BUG FIX: the original always dereferenced pdf_file.name, which crashes
    # when Gradio passes a plain string path. Accept both forms.
    path = pdf_file if isinstance(pdf_file, str) else getattr(pdf_file, "name", pdf_file)
    pdf_text = ""
    try:
        with open(path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            for page in pdf_reader.pages:
                # extract_text() may return None for image-only pages;
                # guard so concatenation cannot raise TypeError.
                pdf_text += (page.extract_text() or "") + "\n"
    except Exception as e:
        return f"Error processing PDF: {str(e)}"
    return pdf_text
def preprocess_text(text):
    """Split raw PDF text into cleaned per-question sections.

    A new section begins at every line starting with 'Question'; blank lines
    are dropped and the remaining lines of a section are joined with spaces.
    Trailing page numbers and the TRAPS / BEST ANSWER / PASSABLE ANSWER
    markers are then stripped from each section.

    Returns:
        A list of cleaned section strings.
    """
    sections = []
    buffer = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            # Blank lines never contribute to a section.
            continue
        if stripped.startswith('Question'):
            # Flush the section in progress before starting a new one.
            if buffer:
                sections.append(' '.join(buffer))
            buffer = [stripped]
        else:
            buffer.append(stripped)
    if buffer:
        sections.append(' '.join(buffer))

    cleaned = []
    for section_text in sections:
        # Drop a trailing page number, then blank out the answer markers.
        without_page = re.sub(r'\d+\s*$', '', section_text)
        without_markers = re.sub(r'TRAPS:|BEST ANSWER:|PASSABLE ANSWER:', ' ', without_page)
        cleaned.append(without_markers.strip())
    return cleaned
def create_qa_system(pdf_text, model_name="all-MiniLM-L6-v2"):
    """Build the retrieval components for one document.

    Encodes each preprocessed section with a SentenceTransformer and stores
    the L2-normalized vectors in a FAISS inner-product index, so that the
    inner-product search behaves as cosine similarity.

    Args:
        pdf_text: Raw text extracted from the PDF.
        model_name: SentenceTransformer model identifier.

    Returns:
        Tuple of (model, index, text_chunks) consumed by query_qa_system.
    """
    text_chunks = preprocess_text(pdf_text)
    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode(text_chunks)
    # Normalizing corpus vectors (and later the query vector) makes the
    # inner product equal to cosine similarity.
    faiss.normalize_L2(vectors)
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    return encoder, index, text_chunks
def query_qa_system(question, model, index, text_chunks, similarity_threshold=0.3):
    """Retrieve the best-matching document section for a free-text question.

    Args:
        question: User's question string.
        model: SentenceTransformer used to embed the question.
        index: FAISS inner-product index over normalized section embeddings.
        text_chunks: Section strings aligned with the index rows.
        similarity_threshold: Minimum cosine score to count as a match.

    Returns:
        Dict with 'question' (matched section label or None), 'full_text',
        'confidence' (float cosine score) and 'found_answer' (bool).
    """
    query_vec = model.encode([question])
    faiss.normalize_L2(query_vec)
    # Only the single best hit is needed.
    scores, ids = index.search(query_vec, 1)
    score = scores[0][0]

    # Guard clause: below the threshold there is no usable match.
    if score < similarity_threshold:
        return {
            'question': None,
            'full_text': "I couldn't find a sufficiently relevant answer to your question in the provided document.",
            'confidence': float(score),
            'found_answer': False
        }

    matched_text = text_chunks[ids[0][0]]
    # Pull out the "Question N:" label for display, if the section has one.
    label = re.search(r'Question \d+:', matched_text)
    return {
        'question': label.group(0) if label else "Matching section",
        'full_text': matched_text,
        'confidence': float(score),
        'found_answer': True
    }
# Module-level QA state, populated by upload_file() once a PDF has been
# processed and read by answer_question().
global_model = None        # SentenceTransformer instance
global_index = None        # FAISS index over the document sections
global_text_chunks = None  # list of preprocessed section strings
def upload_file(file):
    """Gradio handler: process an uploaded PDF and build the QA system.

    Stores the resulting model/index/chunks in the module-level globals so
    answer_question can use them, and returns a status string for the UI.
    """
    global global_model, global_index, global_text_chunks
    if file is None:
        return "❌ Please upload a PDF file."
    try:
        pdf_text = extract_text_from_pdf(file)
        # extract_text_from_pdf signals failure with an "Error…" string
        # instead of raising; forward it to the status box unchanged.
        if isinstance(pdf_text, str) and pdf_text.startswith("Error"):
            return pdf_text
        global_model, global_index, global_text_chunks = create_qa_system(pdf_text)
    except Exception as e:
        return f"❌ Error processing document: {str(e)}"
    return "✅ Document processed successfully! You can now ask questions."
def answer_question(question):
    """Gradio handler: answer a user question against the processed document.

    Returns a display string: either the matched section with its confidence
    score, or a prompt to upload a document / enter a question first.
    """
    global global_model, global_index, global_text_chunks
    # Guard clauses: the QA system must exist and the question be non-blank.
    if global_model is None or global_index is None or global_text_chunks is None:
        return "Please upload and process a document first."
    if not question.strip():
        return "Please enter a question."

    result = query_qa_system(question, global_model, global_index, global_text_chunks)
    if not result['found_answer']:
        return f"{result['full_text']}\nBest match confidence: {result['confidence']:.2f}"
    return f"Found matching section (confidence: {result['confidence']:.2f}):\n\n{result['full_text']}"
# Custom CSS for professional styling.
# Class names here must match the elem_classes used when building the UI
# below (main-header, upload-section, qa-section, status-box, custom-button,
# answer-box, section-title).
custom_css = """
.gradio-container {
    max-width: 1200px !important;
    margin: auto !important;
    padding: 20px !important;
    background-color: #f8f9fa !important;
}
.main-header {
    text-align: center;
    margin-bottom: 2rem;
    padding: 2rem;
    background: linear-gradient(135deg, #1a365d 0%, #2c5282 100%);
    color: white;
    border-radius: 10px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.main-header h1 {
    font-size: 2.5rem;
    margin-bottom: 1rem;
    font-weight: 600;
}
.main-header p {
    font-size: 1.1rem;
    opacity: 0.9;
}
.upload-section {
    background: white;
    padding: 2rem;
    border-radius: 10px;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
    margin-bottom: 2rem;
}
.qa-section {
    background: white;
    padding: 2rem;
    border-radius: 10px;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
}
.status-box {
    margin-top: 1rem;
    padding: 1rem;
    border-radius: 8px;
    background: #f0f9ff;
    border: 1px solid #bae6fd;
}
.custom-button {
    background: #2563eb !important;
    color: white !important;
    border-radius: 8px !important;
    padding: 0.75rem 1.5rem !important;
    font-weight: 500 !important;
}
.custom-button:hover {
    background: #1d4ed8 !important;
}
.answer-box {
    background: #f8fafc !important;
    border: 1px solid #e2e8f0 !important;
    border-radius: 8px !important;
    font-family: 'Source Code Pro', monospace !important;
}
.section-title {
    color: #1e293b;
    font-size: 1.25rem;
    font-weight: 600;
    margin-bottom: 1rem;
}
/* Responsive design */
@media (max-width: 768px) {
    .gradio-container {
        padding: 10px !important;
    }
    .main-header {
        padding: 1.5rem;
    }
    .main-header h1 {
        font-size: 2rem;
    }
}
"""
# Create the enhanced Gradio interface.
# Layout: header row, upload section (file + init button + status), then the
# Q&A section (question box + submit + answer), then a footer.
with gr.Blocks(title="Q&A Assistant", css=custom_css) as demo:
    # Header Section
    with gr.Row(elem_classes=["main-header"]):
        with gr.Column():
            gr.Markdown("# Q&A Assistant")
            gr.Markdown("AI-powered interview preparation companion. Upload your PDF and get instant, relevant answers to your queries.")
    # Upload Section
    with gr.Row():
        with gr.Column(elem_classes=["upload-section"]):
            gr.Markdown("### 📁 Document Upload", elem_classes=["section-title"])
            with gr.Row():
                pdf_upload = gr.File(
                    label="Upload your interview questions PDF",
                    file_types=[".pdf"],
                    elem_classes=["file-upload"]
                )
            with gr.Row():
                upload_button = gr.Button("Initialize Q&A System", elem_classes=["custom-button"])
            with gr.Row():
                status_text = gr.Textbox(
                    label="System Status",
                    value="Upload a PDF to begin",
                    elem_classes=["status-box"]
                )
    # Q&A Section
    with gr.Row():
        with gr.Column(elem_classes=["qa-section"]):
            gr.Markdown("### 💡 Ask Questions", elem_classes=["section-title"])
            with gr.Row():
                question_input = gr.Textbox(
                    label="What would you like to know ?",
                    placeholder="e.g., What are the common behavioral questions?",
                    lines=2
                )
            with gr.Row():
                submit_button = gr.Button("Get Answer", elem_classes=["custom-button"])
            with gr.Row():
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=10,
                    elem_classes=["answer-box"]
                )
    # Information Section (footer)
    with gr.Row():
        gr.Markdown("""
        <div style="text-align: center; padding: 2rem; color: #64748b; font-size: 0.9rem;">
        Made with ❤️ for interview preparation success
        </div>
        """)
    # Set up events: wire buttons to the module-level handler functions.
    upload_button.click(upload_file, inputs=pdf_upload, outputs=status_text)
    submit_button.click(answer_question, inputs=question_input, outputs=answer_output)
# Launch the app when run as a script (as Hugging Face Spaces does).
if __name__ == "__main__":
    demo.launch()