File size: 2,147 Bytes
03b2846
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import gradio as gr
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# Sentence-embedding model used to vectorize both PDF chunks and user queries.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Text-generation model that produces answers from the retrieved context.
# NOTE(review): gpt2 is not instruction-tuned; answers will be free-form text.
qa_model = pipeline("text-generation", model="gpt2")

# Module-level state shared between read_pdfs() and ask_question().
documents = []   # one extracted-text string per uploaded PDF
vectors = None   # float32 numpy matrix of chunk embeddings
index = None     # FAISS IndexFlatL2 over `vectors`; None until PDFs are processed

def read_pdfs(pdf_files):
    """Extract text from uploaded PDFs, chunk it by line, and build a FAISS index.

    Args:
        pdf_files: list of gradio file objects (each exposes a ``.name`` path),
            or None when nothing was uploaded.

    Returns:
        A status string for the UI.

    Side effects:
        Rebuilds the module-level ``documents``, ``vectors`` and ``index``
        globals that ask_question() reads.
    """
    global documents, vectors, index

    # gr.File passes None (or an empty list) when nothing was selected;
    # the original code would crash iterating it.
    if not pdf_files:
        return "No files received. Please upload at least one PDF."

    all_text = ""
    documents = []

    for pdf in pdf_files:
        reader = PdfReader(pdf.name)
        text = ""
        for page in reader.pages:
            # extract_text() returns None for image-only pages; treat as empty
            # instead of raising TypeError on string concatenation.
            text += (page.extract_text() or "") + "\n"
        documents.append(text)
        all_text += text + "\n"

    # Naive chunking: one chunk per line of extracted text. The FAISS row id
    # of each chunk is its position in this list.
    chunks = all_text.split("\n")

    # Embed every chunk and build a flat L2 index over the embeddings.
    embeddings = embed_model.encode(chunks)
    vectors = np.array(embeddings).astype("float32")

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    return "Documents uploaded and processed. You may now ask questions."

def ask_question(query):
    """Answer *query* using the most similar indexed chunks as context.

    Args:
        query: the user's question as a plain string.

    Returns:
        The generated answer string, or a prompt to upload documents first.
    """
    global vectors, index, documents

    if index is None:
        return "Please upload PDF documents first."

    # Embed the query with the same model used for the chunks.
    q_embed = embed_model.encode([query]).astype("float32")

    # Never ask FAISS for more neighbours than the index contains.
    k = min(3, index.ntotal)
    D, I = index.search(q_embed, k=k)

    # Rebuild the exact chunk list read_pdfs() embedded, so FAISS row ids map
    # back to the text they were computed from. (The original code used the
    # row id as a *character offset* into the first document —
    # documents[0][idx: idx + 500] — which returned unrelated text and could
    # raise nothing while silently producing garbage context.)
    all_text = "".join(doc + "\n" for doc in documents)
    chunks = all_text.split("\n")

    context = ""
    for idx in I[0]:
        if idx < 0:
            # FAISS pads missing neighbours with -1 when k > matches found.
            continue
        context += chunks[idx] + "\n"

    # Generate an answer conditioned on the retrieved context.
    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
    answer = qa_model(prompt, max_length=120)[0]["generated_text"]

    return answer

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## PDF Chatbot")
    pdf_input = gr.File(label="Upload multiple PDFs", file_count="multiple")
    upload_btn = gr.Button("Process Documents")
    status = gr.Textbox(label="Status")

    question = gr.Textbox(label="Ask a question")
    answer = gr.Textbox(label="Answer")

    upload_btn.click(read_pdfs, inputs=pdf_input, outputs=status)
    question.submit(ask_question, inputs=question, outputs=answer)

demo.launch()