import io
import os

import cohere
import gradio as gr
import PyPDF2
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

# SECURITY NOTE(review): API keys should never be hard-coded in source.
# Environment variables take precedence; the literals below are kept only as a
# backward-compatible fallback and should be rotated and removed.
pc = Pinecone(api_key=os.environ.get(
    "PINECONE_API_KEY", "0f78bc1b-81f7-4a15-9af3-0fbcf0acdb4e"))
index = pc.Index("quickstart")

# Sentence-transformer used for both document and query embeddings, so the
# two vector spaces match.
model = SentenceTransformer('all-MiniLM-L6-v2')

co = cohere.Client(os.environ.get(
    "COHERE_API_KEY", "CxIrucBVA8NNJJOBUnxwRWq488MVydBku1DlqP1u"))


def extract_text_from_pdf(pdf_file):
    """Extract the full text of an uploaded PDF, with error handling.

    Args:
        pdf_file: Either a filesystem path (str) -- what ``gr.File`` delivers
            when constructed with ``type="filepath"`` -- or a file-like object
            exposing a ``.name`` path attribute.

    Returns:
        The concatenated page text, or a human-readable error message string
        (callers display the return value either way).
    """
    try:
        if pdf_file is None:
            return "No file uploaded."
        # BUG FIX: gr.File(type="filepath") passes a plain string path, which
        # has no ``.name`` attribute; the old ``pdf_file.name`` access raised
        # AttributeError for every upload. Accept both forms.
        path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        with open(path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # extract_text() can return None for image-only pages; coalesce
            # each page to "" so the join never fails.
            text = "".join(
                page.extract_text() or "" for page in pdf_reader.pages)
        if not text.strip():
            return "The uploaded PDF is empty or has no readable content."
        return text
    except PyPDF2.errors.PdfReadError:
        return "The uploaded PDF is encrypted or unreadable."
    except Exception as e:
        return f"Error reading PDF: {str(e)}"


def store_pdf_embeddings(pdf_text):
    """Chunk the PDF text, embed each chunk, and upsert into Pinecone.

    Args:
        pdf_text: Full extracted document text.

    Returns:
        A status message for the upload UI.
    """
    # Fixed-size 512-character windows; simple but adequate for this demo.
    segments = [pdf_text[i:i + 512] for i in range(0, len(pdf_text), 512)]
    embeddings = model.encode(segments)
    # BUG FIX: persist the segment text as vector metadata. Without it, a
    # query can only recover the opaque vector id, so the QA prompt would
    # contain no real document context.
    vectors = [
        (f"seg-{i}", embed.tolist(), {"text": seg})
        for i, (embed, seg) in enumerate(zip(embeddings, segments))
    ]
    index.upsert(vectors=vectors)
    return "PDF uploaded and stored successfully!"
def ask_question(query):
    """Answer a user question using the most relevant stored PDF segment.

    Args:
        query: The user's natural-language question.

    Returns:
        A ``(retrieved_segment, answer)`` tuple of strings, matching the two
        Gradio output textboxes.
    """
    query_embedding = model.encode(query).tolist()
    # Retrieve the single closest segment; include_metadata lets us recover
    # the stored passage text when the vector was upserted with it.
    result = index.query(top_k=1, vector=query_embedding, include_metadata=True)
    matches = result['matches']
    # BUG FIX: guard against an empty index (question asked before any PDF
    # was uploaded) — the old code raised IndexError here.
    if not matches:
        return "No relevant segment found.", "Please upload a PDF first."
    match = matches[0]
    # BUG FIX: the old prompt "context" was the literal vector id string, so
    # the model never saw any document text. Prefer the stored passage; fall
    # back to the id label for vectors upserted without metadata.
    # NOTE(review): match objects are assumed dict-like (the original code
    # already indexes them with ['id']) — confirm against the Pinecone client
    # version in use.
    metadata = match.get('metadata') or {}
    segment_text = metadata.get('text') or f"Segment: {match['id']}"
    # Generate the answer with the retrieved passage as context.
    prompt = f"{segment_text}\nQuestion: {query}\nAnswer:"
    response = co.generate(
        model="command-xlarge-nightly",
        prompt=prompt,
        max_tokens=50
    )
    answer = response.generations[0].text.strip()
    return segment_text, answer


# --- Gradio interface -------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Interactive QA Bot with PDF Support")

    # PDF upload section.
    pdf_input = gr.File(label="Upload PDF", type="filepath",
                        file_types=[".pdf"])
    upload_status = gr.Textbox(label="Upload Status", interactive=False)
    upload_button = gr.Button("Upload and Store")

    # Extract then store; reject a missing file before touching the pipeline.
    upload_button.click(
        lambda pdf: store_pdf_embeddings(extract_text_from_pdf(pdf))
        if pdf is not None else "Please upload a valid PDF.",
        inputs=pdf_input,
        outputs=upload_status
    )

    # Question-and-answer section.
    query_input = gr.Textbox(label="Enter your question")
    segment_output = gr.Textbox(label="Retrieved Segment", interactive=False)
    answer_output = gr.Textbox(label="Answer", interactive=False)
    query_button = gr.Button("Ask")

    query_button.click(
        ask_question,
        inputs=query_input,
        outputs=[segment_output, answer_output]
    )

demo.launch(share=True)  # share=True publishes a temporary public link