File size: 4,606 Bytes
7fdfc68
 
 
cf68bef
 
7fdfc68
 
 
cf68bef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7fdfc68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf68bef
 
7fdfc68
 
 
 
 
 
 
 
 
cf68bef
7fdfc68
cf68bef
 
 
7fdfc68
 
cf68bef
 
7fdfc68
 
 
 
 
cf68bef
 
 
 
 
7fdfc68
cf68bef
7fdfc68
 
 
 
 
 
 
 
cf68bef
 
 
7fdfc68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf68bef
 
 
 
7fdfc68
 
cf68bef
 
 
7fdfc68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import streamlit as st
import chromadb
import torch
from transformers import pipeline, AutoModel, AutoTokenizer
import numpy as np
from PyPDF2 import PdfReader
import os

# Load the transformer model and tokenizer used to compute embeddings
def load_embedding_model():
    """Load the pretrained encoder and its matching tokenizer.

    Both objects are created with ``from_pretrained`` from the same
    ``cross-encoder/qnli-electra-base`` checkpoint.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    checkpoint = "cross-encoder/qnli-electra-base"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)
    return model, tokenizer

# Generate embeddings for text
def generate_embedding(model, tokenizer, text):
    """Embed *text* as one vector by mean-pooling the model's token states.

    Args:
        model: Encoder called as ``model(**inputs)``; its output must expose
            a ``last_hidden_state`` tensor.
        tokenizer: Callable that turns *text* into a mapping of PyTorch
            tensors (called with ``return_tensors="pt"``).
        text: String to embed. Input is truncated to at most 512 tokens.

    Returns:
        A 1-D NumPy array holding the mean of the token states.
        NOTE(review): assumes ``last_hidden_state`` is shaped
        ``(1, seq_len, hidden)`` for a single input string.
    """
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # If the model reports a device (Hugging Face models expose ``.device``),
    # move the inputs there so the forward pass also works off-CPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {
            name: value.to(device) if hasattr(value, "to") else value
            for name, value in inputs.items()
        }

    # Inference only: no gradients are needed for embeddings.
    with torch.no_grad():
        outputs = model(**inputs)
        # Average over the token axis to get one vector per input.
        pooled = outputs.last_hidden_state.mean(dim=1)

    # squeeze(0) removes only the batch axis; a bare squeeze() would also
    # collapse a size-1 hidden axis and return a 0-d array. The tensor is
    # brought back to the CPU because .numpy() rejects non-CPU tensors.
    return pooled.squeeze(0).detach().cpu().numpy()

# Initialize Hugging Face pipeline for question answering
def load_qa_pipeline():
    """Build the Hugging Face question-answering pipeline.

    Returns:
        The object returned by ``pipeline("question-answering", ...)``
        configured with the ``deepset/roberta-base-squad2`` model.
    """
    qa_model_name = "deepset/roberta-base-squad2"
    qa = pipeline("question-answering", model=qa_model_name)
    return qa

# Extract text from PDF
def extract_pdf_text(pdf_file):
    """Return the text of every page of a PDF, one page per line group.

    Args:
        pdf_file: Anything ``PdfReader`` accepts (here, the uploaded file
            object passed in by the caller).

    Returns:
        The concatenated page texts, each followed by a newline.
    """
    reader = PdfReader(pdf_file)
    # A page may produce no text (presumably image-only pages; verify against
    # the installed PyPDF2 version). Treat a missing result as an empty
    # string instead of failing on ``None + "\n"``. ``join`` also avoids
    # repeated string concatenation in a loop.
    return "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping, fixed-width character windows.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Characters shared between neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks covering all of *text*; empty for empty input.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # A step of zero makes range() fail cryptically and a negative step
    # silently yields no chunks at all, so reject both up front.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        chunks.append(text[start:start + chunk_size])
        # Once a window reaches the end of the text, any further window
        # would be fully contained in this one, so stop here.
        if start + chunk_size >= text_length:
            break
    return chunks

# Create ChromaDB collection with embeddings
def create_chroma_collection(chunks, model, tokenizer):
    """Create a fresh ChromaDB collection holding one embedding per chunk.

    Args:
        chunks: Iterable of text chunks to index.
        model: Embedding model passed through to ``generate_embedding``.
        tokenizer: Tokenizer passed through to ``generate_embedding``.

    Returns:
        A ``(client, collection, collection_name)`` tuple so the caller can
        query the collection and delete it by name afterwards.

    Raises:
        Exception: Any error raised while embedding or adding a chunk is
            re-raised after the partially filled collection is deleted.
    """
    import uuid  # local import: only needed to name the collection

    # Use persistent client to avoid memory issues
    client = chromadb.PersistentClient(path="./chroma_db")

    # The store persists on disk, so the name must not clash with a
    # collection left by an earlier run; a random 4-digit suffix collides
    # easily, a UUID practically never does.
    collection_name = f"pdf_qa_collection_{uuid.uuid4().hex}"
    collection = client.create_collection(name=collection_name)

    try:
        for i, chunk in enumerate(chunks):
            embedding = generate_embedding(model, tokenizer, chunk)
            collection.add(
                ids=[f"chunk_{i}"],
                documents=[chunk],
                embeddings=[embedding.tolist()],
            )
    except Exception:
        # The caller never receives the name on failure, so remove the
        # half-built collection here instead of leaving it on disk.
        client.delete_collection(name=collection_name)
        raise

    return client, collection, collection_name

# Retrieve most relevant context
def retrieve_context(collection, question, model, tokenizer, top_k=3):
    """Return the stored chunks closest to *question* in embedding space.

    Args:
        collection: ChromaDB collection to search.
        question: The user's question text.
        model: Embedding model passed through to ``generate_embedding``.
        tokenizer: Tokenizer passed through to ``generate_embedding``.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` document strings; empty when the
        collection holds no documents or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than the collection contains; short
    # documents can produce fewer than top_k chunks.
    n_results = min(top_k, collection.count())
    if n_results <= 0:
        return []

    question_embedding = generate_embedding(model, tokenizer, question)

    results = collection.query(
        query_embeddings=[question_embedding.tolist()],
        n_results=n_results,
    )
    # One query embedding was sent, so the matches are in the first entry.
    return results['documents'][0]

# Main Streamlit app
def main():
    """Run the Streamlit UI: upload a PDF, ask a question, show an answer.

    When "Get Answer" is pressed, the PDF text is extracted, chunked and
    indexed in a temporary ChromaDB collection. The chunks most similar to
    the question are passed to the QA pipeline and the highest-scoring
    answer is displayed. The temporary collection is always deleted
    afterwards, whether or not an error occurred.
    """
    st.title("PDF Question Answering App")

    # Load embedding model
    embedding_model, tokenizer = load_embedding_model()

    uploaded_file = st.file_uploader("Upload PDF", type=['pdf'])
    question = st.text_input("Enter your question")

    if not st.button("Get Answer"):
        return

    # Tell the user what is missing instead of silently doing nothing.
    if not uploaded_file or not question:
        st.warning("Please upload a PDF and enter a question.")
        return

    # Stay None until the collection exists so cleanup knows whether
    # there is anything to delete.
    client = None
    collection_name = None
    try:
        qa_pipeline = load_qa_pipeline()

        pdf_text = extract_pdf_text(uploaded_file)
        if not pdf_text.strip():
            st.warning("No extractable text was found in the PDF.")
            return

        text_chunks = split_text_into_chunks(pdf_text)

        client, collection, collection_name = create_chroma_collection(
            text_chunks, embedding_model, tokenizer
        )

        contexts = retrieve_context(
            collection, question, embedding_model, tokenizer
        )

        # Skip whitespace-only chunks: they cannot contain an answer.
        answers = [
            qa_pipeline(question=question, context=context)
            for context in contexts
            if context.strip()
        ]
        if not answers:
            st.warning("No relevant context was found for this question.")
            return

        # Display best answer
        best_answer = max(answers, key=lambda x: x['score'])
        st.write("Answer:", best_answer['answer'])
        st.write("Confidence Score:", best_answer['score'])

    except Exception as e:
        # Top-level UI boundary: report the failure rather than crash.
        st.error(f"An error occurred: {e}")
    finally:
        # Clean up the temporary collection even when a step above failed,
        # so the on-disk store does not accumulate stale collections.
        if client is not None and collection_name is not None:
            try:
                client.delete_collection(name=collection_name)
            except Exception as cleanup_error:
                st.warning(f"Could not remove temporary collection: {cleanup_error}")

# Script entry point: start the app only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()