File size: 3,532 Bytes
9b255cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import io
import os

import gradio as gr
import PyPDF2
import cohere
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

# --- Service clients -------------------------------------------------------
# SECURITY: API keys were previously hard-coded below and are therefore
# compromised — rotate them. Keys are now read from the environment first;
# the literal fallbacks only keep the script runnable until rotation.

# Pinecone client and the index used for vector storage/retrieval.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY", "0f78bc1b-81f7-4a15-9af3-0fbcf0acdb4e")
)
index = pc.Index("quickstart")

# Sentence-transformer model that embeds text segments and queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Cohere client used to generate answers from retrieved context.
co = cohere.Client(
    os.environ.get("COHERE_API_KEY", "CxIrucBVA8NNJJOBUnxwRWq488MVydBku1DlqP1u")
)

def extract_text_from_pdf(pdf_file):
    """Extract all readable text from an uploaded PDF.

    Args:
        pdf_file: Either a filesystem path string — what
            ``gr.File(type="filepath")`` actually passes — or an object
            exposing a ``.name`` path attribute (older Gradio tempfile
            wrappers).

    Returns:
        The concatenated text of all pages, or a human-readable error
        message string on failure.
    """
    if pdf_file is None:
        return "No file uploaded."

    # BUG FIX: the original did `open(pdf_file.name)`, which raises
    # AttributeError when Gradio passes a plain string path (the broad
    # except then masked it as "Error reading PDF: ...").
    path = pdf_file if isinstance(pdf_file, str) else getattr(pdf_file, "name", None)
    if path is None:
        return "No file uploaded."

    try:
        with open(path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # extract_text() may return None for image-only pages.
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

        if not text.strip():
            return "The uploaded PDF is empty or has no readable content."
        return text

    except PyPDF2.errors.PdfReadError:
        return "The uploaded PDF is encrypted or unreadable."
    except Exception as e:
        return f"Error reading PDF: {str(e)}"

def store_pdf_embeddings(pdf_text):
    """Embed *pdf_text* in 512-character segments and upsert into Pinecone.

    Each vector now carries its segment text as metadata so the content
    can be recovered at query time — the original stored only the vector
    and id, leaving the retrieved context unrecoverable.

    Args:
        pdf_text: The full extracted document text.

    Returns:
        A human-readable status message string.
    """
    # Guard against empty/whitespace input so we never upsert nothing
    # (or, worse, embed an upstream error message by accident).
    if not pdf_text or not pdf_text.strip():
        return "Nothing to store: the PDF contained no text."

    segments = [pdf_text[i:i + 512] for i in range(0, len(pdf_text), 512)]
    embeddings = model.encode(segments)
    vectors = [
        # (id, values, metadata) triple — metadata preserves the raw text.
        (f"seg-{i}", embedding.tolist(), {"text": segment})
        for i, (segment, embedding) in enumerate(zip(segments, embeddings))
    ]
    index.upsert(vectors=vectors)
    return "PDF uploaded and stored successfully!"

def ask_question(query):
    """Answer *query* using the most relevant stored PDF segment as context.

    Args:
        query: The user's question string.

    Returns:
        A ``(segment_text, answer)`` tuple of strings.
    """
    if not query or not query.strip():
        return "No question provided.", ""

    query_embedding = model.encode(query).tolist()

    # BUG FIX: the original built the "context" as the literal string
    # "Segment: <id>", so the LLM prompt never contained any document
    # content. Request metadata so the stored segment text comes back.
    result = index.query(top_k=1, vector=query_embedding, include_metadata=True)
    matches = result['matches']
    if not matches:
        return "No relevant segment found. Please upload a PDF first.", ""

    match = matches[0]
    # Fall back to the id when metadata is absent (e.g. vectors written
    # by an older version of this script) — TODO confirm against index.
    metadata = match.get('metadata') or {}
    segment_text = metadata.get('text') or f"Segment: {match['id']}"

    # Generate the answer using the retrieved segment as context.
    prompt = f"{segment_text}\nQuestion: {query}\nAnswer:"
    response = co.generate(
        model="command-xlarge-nightly",
        prompt=prompt,
        max_tokens=50
    )

    answer = response.generations[0].text.strip()
    return segment_text, answer

def _handle_upload(pdf_file):
    """Gradio callback: extract the PDF's text and store its embeddings.

    Short-circuits on extraction failures — the original lambda piped
    error strings ("No file uploaded.", "Error reading PDF: ...") straight
    into store_pdf_embeddings, embedding error messages as document content.
    """
    if pdf_file is None:
        return "Please upload a valid PDF."
    text = extract_text_from_pdf(pdf_file)
    # extract_text_from_pdf signals failure via message strings; surface
    # them to the user instead of embedding them.
    if text.startswith(("No file uploaded", "The uploaded PDF", "Error reading PDF:")):
        return text
    return store_pdf_embeddings(text)


# Gradio Interface Setup: PDF upload on top, Q&A section below.
with gr.Blocks() as demo:
    gr.Markdown("# Interactive QA Bot with PDF Support")

    # PDF Upload Section
    pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
    upload_status = gr.Textbox(label="Upload Status", interactive=False)
    upload_button = gr.Button("Upload and Store")

    # Handle PDF Upload
    upload_button.click(
        _handle_upload,
        inputs=pdf_input,
        outputs=upload_status,
    )

    # Question and Answer Section
    query_input = gr.Textbox(label="Enter your question")
    segment_output = gr.Textbox(label="Retrieved Segment", interactive=False)
    answer_output = gr.Textbox(label="Answer", interactive=False)
    query_button = gr.Button("Ask")

    # Handle User Questions
    query_button.click(
        ask_question, inputs=query_input, outputs=[segment_output, answer_output]
    )

# NOTE: share=True exposes the app via a public Gradio link — disable for
# private deployments.
demo.launch(share=True)