Spaces:
Sleeping
Sleeping
File size: 4,606 Bytes
7fdfc68 cf68bef 7fdfc68 cf68bef 7fdfc68 cf68bef 7fdfc68 cf68bef 7fdfc68 cf68bef 7fdfc68 cf68bef 7fdfc68 cf68bef 7fdfc68 cf68bef 7fdfc68 cf68bef 7fdfc68 cf68bef 7fdfc68 cf68bef 7fdfc68 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
import streamlit as st
import chromadb
import torch
from transformers import pipeline, AutoModel, AutoTokenizer
import numpy as np
from PyPDF2 import PdfReader
import os
# Load sentence transformer model for embeddings
def load_embedding_model():
model = AutoModel.from_pretrained("cross-encoder/qnli-electra-base")
tokenizer = AutoTokenizer.from_pretrained("cross-encoder/qnli-electra-base")
return model, tokenizer
# Generate embeddings for text
def generate_embedding(model, tokenizer, text):
# Tokenize the text
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
# Generate embeddings
with torch.no_grad():
outputs = model(**inputs)
# Use the last hidden state as embedding
embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
return embeddings
# Initialize Hugging Face pipeline for question answering
def load_qa_pipeline():
return pipeline("question-answering", model="deepset/roberta-base-squad2")
# Extract text from PDF
def extract_pdf_text(pdf_file):
reader = PdfReader(pdf_file)
text = ""
for page in reader.pages:
text += page.extract_text() + "\n"
return text
# Split text into chunks
def split_text_into_chunks(text, chunk_size=500, overlap=100):
chunks = []
for i in range(0, len(text), chunk_size - overlap):
chunks.append(text[i:i+chunk_size])
return chunks
# Create ChromaDB collection with embeddings
def create_chroma_collection(chunks, model, tokenizer):
# Use persistent client to avoid memory issues
client = chromadb.PersistentClient(path="./chroma_db")
# Create a unique collection name
collection_name = f"pdf_qa_collection_{int(torch.rand(1).item() * 10000)}"
# Create collection
collection = client.create_collection(name=collection_name)
# Add chunks to collection with embeddings
for i, chunk in enumerate(chunks):
# Generate embedding for the chunk
embedding = generate_embedding(model, tokenizer, chunk)
collection.add(
ids=[f"chunk_{i}"],
documents=[chunk],
embeddings=[embedding.tolist()]
)
return client, collection, collection_name
# Retrieve most relevant context
def retrieve_context(collection, question, model, tokenizer, top_k=3):
# Generate embedding for the question
question_embedding = generate_embedding(model, tokenizer, question)
# Query the collection
results = collection.query(
query_embeddings=[question_embedding.tolist()],
n_results=top_k
)
return results['documents'][0]
# Main Streamlit app
def main():
st.title("PDF Question Answering App")
# Load embedding model
embedding_model, tokenizer = load_embedding_model()
# File uploader
uploaded_file = st.file_uploader("Upload PDF", type=['pdf'])
# Question input
question = st.text_input("Enter your question")
# Run button
if st.button("Get Answer"):
if uploaded_file and question:
try:
# Load QA pipeline
qa_pipeline = load_qa_pipeline()
# Extract PDF text
pdf_text = extract_pdf_text(uploaded_file)
# Split text into chunks
text_chunks = split_text_into_chunks(pdf_text)
# Create ChromaDB collection with embeddings
client, collection, collection_name = create_chroma_collection(
text_chunks, embedding_model, tokenizer
)
# Retrieve context
contexts = retrieve_context(
collection, question, embedding_model, tokenizer
)
# Prepare answers
answers = []
for context in contexts:
result = qa_pipeline(question=question, context=context)
answers.append(result)
# Display best answer
best_answer = max(answers, key=lambda x: x['score'])
st.write("Answer:", best_answer['answer'])
st.write("Confidence Score:", best_answer['score'])
# Clean up ChromaDB collection
client.delete_collection(name=collection_name)
except Exception as e:
st.error(f"An error occurred: {e}")
if __name__ == "__main__":
main() |