File size: 3,266 Bytes
188fa75
 
 
 
 
 
 
 
 
 
60ff4c1
188fa75
 
60ff4c1
188fa75
 
 
 
 
 
 
60ff4c1
 
 
188fa75
 
 
 
 
 
 
 
60ff4c1
 
 
188fa75
 
60ff4c1
 
 
 
 
188fa75
60ff4c1
 
188fa75
 
 
 
 
 
60ff4c1
 
 
 
 
 
188fa75
60ff4c1
188fa75
 
 
 
60ff4c1
188fa75
 
 
 
 
 
 
 
 
60ff4c1
188fa75
 
60ff4c1
 
 
 
188fa75
60ff4c1
 
 
 
 
 
 
 
 
 
188fa75
 
 
 
 
 
60ff4c1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os
from io import BytesIO
import streamlit as st
from PyPDF2 import PdfReader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from groq import Groq
import tempfile
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Initialize Groq API Client.
# SECURITY: never hard-code API keys in source — the previously committed
# literal key is compromised and must be revoked/rotated. The key is now
# read from the GROQ_API_KEY environment variable instead.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Helper Functions
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Pages for which PyPDF2 yields no text (None) are skipped.
    """
    reader = PdfReader(pdf_file)
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(t for t in page_texts if t)

def create_chunks(text, chunk_size=500):
    """Chunk the text into smaller pieces for processing.

    Returns consecutive slices of *text*, each at most *chunk_size*
    characters; the final piece may be shorter. An empty string yields [].
    """
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces

def create_embeddings(chunks):
    """Embed *chunks* with SentenceTransformers and return a FAISS L2 index.

    Raises:
        ValueError: if *chunks* is empty.
    """
    if not chunks:
        raise ValueError("No text chunks provided for embedding.")

    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    # atleast_2d promotes a single-chunk (d,) vector to (1, d).
    vectors = np.atleast_2d(encoder.encode(chunks))

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors.astype('float32'))  # FAISS expects float32
    return index

def interact_with_model(query, faiss_index, chunks):
    """Answer *query* using the nearest chunks from *faiss_index* as context.

    Embeds the query with the same SentenceTransformer model used for the
    chunks, retrieves up to 3 nearest chunks, and forwards context + query
    to the Groq chat completion API.

    Returns the model's response text.
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")
    query_embedding = model.encode([query])

    # Search FAISS index for the 3 nearest chunks.
    distances, indices = faiss_index.search(query_embedding.astype('float32'), k=3)

    # BUG FIX: FAISS pads the result with -1 when the index holds fewer
    # than k vectors; the old guard (i < len(chunks)) let -1 through and
    # silently pulled in chunks[-1]. Check both bounds.
    docs = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
    context = " ".join(docs)

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Context: {context}\n\n{query}"},
    ]

    chat_completion = client.chat.completions.create(
        messages=messages, model="llama-3.3-70b-versatile"
    )
    return chat_completion.choices[0].message.content

# Streamlit Frontend
def main():
    """Streamlit UI: upload a PDF, index it, then answer questions about it."""
    st.title("PDF Query App")
    uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])

    # Guard clause: nothing to do until a file is provided.
    if uploaded_file is None:
        return

    text = extract_text_from_pdf(uploaded_file)
    if not text.strip():
        st.error("PDF contains no extractable text. Upload a valid PDF.")
        return

    chunks = create_chunks(text)
    if not chunks:
        st.error("No text chunks created. Check PDF content.")
        return

    try:
        faiss_index = create_embeddings(chunks)
    except Exception as e:
        st.error(f"Error creating embeddings: {str(e)}")
        return

    query = st.text_input("Ask a question about the PDF:")
    if query:
        st.write(interact_with_model(query, faiss_index, chunks))

# Entry point when executed directly (e.g. `streamlit run <file>`).
if __name__ == "__main__":
    main()