Spaces:

amasood
/

myRAG

Build error

App Files Community

amasood committed on Dec 29, 2024

Commit

86cad8f

verified ·

1 Parent(s): 8fbc00e

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -91

app.py CHANGED Viewed

@@ -1,120 +1,102 @@
 import os
-import streamlit as st
-import PyPDF2
 import torch
-from transformers import GPT2LMHeadModel, GPT2Tokenizer
 import faiss
-import numpy as np
-# Load GPT-2 Model and Tokenizer
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model_name = "gpt2"
-tokenizer = GPT2Tokenizer.from_pretrained(model_name)
-model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
-# Set pad_token to eos_token
-tokenizer.pad_token = tokenizer.eos_token
-# Sidebar for file upload
-st.sidebar.title("Upload PDFs")
-uploaded_files = st.sidebar.file_uploader("Upload one or more PDF files", accept_multiple_files=True, type=["pdf"])
-# Process PDF files
-def extract_text_from_pdf(pdf_files):
     text_data = []
-    for file in pdf_files:
-        pdf_reader = PyPDF2.PdfReader(file)
         text = ""
-        for page in pdf_reader.pages:
-            text += page.extract_text()
         text_data.append(text)
     return text_data
-# Create FAISS index
 def create_faiss_index(text_data):
-    """
-    Creates a FAISS index from the text data.
-    """
-    # Enable hidden states in the model configuration
-    model.config.output_hidden_states = True
-    # Initialize FAISS index
-    dim = model.config.hidden_size  # GPT-2 hidden size
-    index = faiss.IndexFlatL2(dim)
     embeddings = []
     for text in text_data:
-        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
         with torch.no_grad():
-            outputs = model(**inputs)
-            # Extract the last layer's hidden state
-            hidden_states = outputs.hidden_states[-1]
-            embedding = hidden_states.mean(dim=1).cpu().numpy()
-            embeddings.append(embedding)
-            index.add(embedding)
     return index, embeddings
-# Answer queries
-def answer_query(query, index, embeddings, text_data):
-    """
-    Answers a query based on the FAISS index and text data.
-    """
-    # Check if FAISS index is populated
-    if index.ntotal == 0:
-        raise ValueError("The FAISS index is empty. Please upload documents to populate the database.")
-    # Enable hidden states in the model configuration
-    model.config.output_hidden_states = True
-    # Tokenize the query
-    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True).to(device)
     with torch.no_grad():
-        outputs = model(**inputs)
         query_embedding = outputs.hidden_states[-1].mean(dim=1).cpu().numpy()
-    # Search for the nearest neighbor in the FAISS index
     _, indices = index.search(query_embedding, k=1)
-    if len(indices) == 0 or indices[0][0] < 0:
-        raise ValueError("No relevant context found for the given query.")
     nearest_index = indices[0][0]
-    # Ensure text data size matches the FAISS index
-    if nearest_index >= len(text_data):
-        raise IndexError("Index out of range in text data. Please ensure data alignment.")
-    # Retrieve the most relevant text
     relevant_text = text_data[nearest_index]
-    # Generate an answer using the model
     input_text = f"Context: {relevant_text}\nQuestion: {query}\nAnswer:"
-    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
     with torch.no_grad():
         outputs = model.generate(**inputs, max_new_tokens=200)
-    # Decode the generated answer
-    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return answer
-# Main app
-if uploaded_files:
-    st.title("RAG Query Application")
-    text_data = extract_text_from_pdf(uploaded_files)
-    index, embeddings = create_faiss_index(text_data)
-    query = st.text_input("Enter your query:")
-    if query:
-        with st.spinner("Fetching answer..."):
-            answer = answer_query(query, index, embeddings, text_data)
-        st.success(answer)
-else:
-    st.title("Upload PDFs to Build RAG Database")
-    st.write("Please upload one or more PDF files using the sidebar to start.")

+import json
 import os
+
 import torch
 import faiss
+from PyPDF2 import PdfReader
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
+import streamlit as st
+# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+else:
+    device = torch.device("cpu")
+
+
+@st.cache_resource
+def load_model_and_tokenizer():
+    """Load the pretrained "gpt2" language model and its tokenizer.
+
+    The model is moved to the module-level ``device``. The function is
+    wrapped in ``st.cache_resource`` (presumably so the weights are loaded
+    once rather than on every Streamlit rerun).
+
+    Returns:
+        A ``(model, tokenizer)`` tuple.
+    """
+    gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+    gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
+    gpt2_model = gpt2_model.to(device)
+    # The tokenizer calls in this file pass padding=True, so give the
+    # tokenizer a padding token by reusing its end-of-sequence token.
+    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
+    return gpt2_model, gpt2_tokenizer
+
+
+model, tokenizer = load_model_and_tokenizer()
+def extract_text_from_pdfs(uploaded_files):
+    """Extract the text of each uploaded PDF.
+
+    Args:
+        uploaded_files: Iterable of file-like objects accepted by
+            ``PdfReader``.
+
+    Returns:
+        A list with one string per input file, in input order. Each string
+        is the concatenation of that file's page texts; a page whose
+        ``extract_text()`` result is falsy contributes an empty string.
+    """
+    return [
+        "".join(page.extract_text() or "" for page in PdfReader(pdf_file).pages)
+        for pdf_file in uploaded_files
+    ]
+# Function to create a FAISS index
 def create_faiss_index(text_data):
+    """Embed every document with GPT-2 and store the vectors in a FAISS index.
+
+    Each document is tokenized (truncated to 1024 tokens), run through the
+    module-level ``model``, and represented by the mean of the last hidden
+    layer over the token axis. Row ``i`` of the index corresponds to
+    ``text_data[i]``.
+
+    Args:
+        text_data: Non-empty list of document strings.
+
+    Returns:
+        ``(index, embeddings)`` where ``index`` is a ``faiss.IndexFlatL2``
+        holding one vector per document and ``embeddings`` is the NumPy
+        array of those vectors, one row per document.
+
+    Raises:
+        ValueError: If ``text_data`` is empty.
+    """
+    if not text_data:
+        raise ValueError("Cannot build a FAISS index from an empty document list.")
     embeddings = []
     for text in text_data:
+        # A document with no extractable text would tokenize to an empty
+        # sequence; embed the EOS token instead so that the document keeps
+        # its row and index ids stay aligned with text_data.
+        inputs = tokenizer(text or tokenizer.eos_token, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(device)
         with torch.no_grad():
+            outputs = model(**inputs, output_hidden_states=True)
+            # Keep the pooled vector as a CPU tensor; everything is
+            # converted to NumPy once, after the loop.
+            embeddings.append(outputs.hidden_states[-1].mean(dim=1).cpu())
+    embeddings = torch.cat(embeddings, dim=0).numpy()
+    dimension = embeddings.shape[1]
+    index = faiss.IndexFlatL2(dimension)
+    index.add(embeddings)
     return index, embeddings
+# Function to answer queries
+def answer_query(query, index, text_data):
+    """Retrieve the closest document for ``query`` and let GPT-2 continue a prompt.
+
+    The query is embedded the same way documents are embedded when the
+    index is built (mean of the last hidden layer), the single nearest
+    document is looked up in ``index``, and a
+    ``Context / Question / Answer:`` prompt is passed to ``model.generate``.
+
+    Args:
+        query: The user's question.
+        index: FAISS index whose row ids are positions in ``text_data``.
+        text_data: Sequence of document strings aligned with ``index``.
+
+    Returns:
+        The decoded generation output, which includes the prompt followed
+        by the generated continuation.
+
+    Raises:
+        ValueError: If the index returns no neighbour for the query.
+        IndexError: If the returned id has no matching entry in ``text_data``.
+    """
+    max_positions = 1024  # the window this file already uses as max_length
+    max_new_tokens = 200
+    # The prompt and the generated tokens must fit in the window together.
+    prompt_budget = max_positions - max_new_tokens
+    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(device)
     with torch.no_grad():
+        outputs = model(**inputs, output_hidden_states=True)
         query_embedding = outputs.hidden_states[-1].mean(dim=1).cpu().numpy()
     _, indices = index.search(query_embedding, k=1)
+    nearest_index = int(indices[0][0])
+    # A negative id means the search produced no neighbour (e.g. an empty
+    # index); without this check text_data[-1] would be used silently.
+    if nearest_index < 0:
+        raise ValueError("No relevant context found for the given query.")
+    if nearest_index >= len(text_data):
+        raise IndexError("Index out of range in text data. Please ensure data alignment.")
     relevant_text = text_data[nearest_index]
+    # Shorten the context, not the question: plain right-truncation of the
+    # whole prompt would cut off the question and the "Answer:" cue.
+    scaffold_ids = tokenizer(f"Context: \nQuestion: {query}\nAnswer:")["input_ids"]
+    context_budget = max(prompt_budget - len(scaffold_ids), 0)
+    context_ids = tokenizer(relevant_text)["input_ids"]
+    if len(context_ids) > context_budget:
+        relevant_text = tokenizer.decode(context_ids[:context_budget])
     input_text = f"Context: {relevant_text}\nQuestion: {query}\nAnswer:"
+    # truncation stays on as a safety net for very long questions.
+    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=prompt_budget).to(device)
     with torch.no_grad():
+        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+# Streamlit UI
+st.title("RAG App with GPT-2")
+st.write("Upload PDF files to build a database and ask questions!")
+# Files the database is persisted to. Documents are stored as a JSON list:
+# extracted PDF text contains newlines, so a one-document-per-line text
+# file cannot be read back in alignment with the FAISS row ids.
+INDEX_PATH = "faiss_index.bin"
+TEXT_DATA_PATH = "text_data.json"
+# Nothing is loaded until a database is built or found on disk
+index = None
+text_data = None
+# Upload PDF files
+uploaded_files = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)
+# Build database
+if st.button("Build Database") and uploaded_files:
+    with st.spinner("Processing files..."):
+        text_data = extract_text_from_pdfs(uploaded_files)
+        index, _ = create_faiss_index(text_data)
+        # Save the index and text data
+        faiss.write_index(index, INDEX_PATH)
+        with open(TEXT_DATA_PATH, "w", encoding="utf-8") as f:
+            json.dump(text_data, f)
+        st.success("Database built successfully!")
+# Load existing database
+if os.path.exists(INDEX_PATH) and os.path.exists(TEXT_DATA_PATH):
+    with st.spinner("Loading existing database..."):
+        index = faiss.read_index(INDEX_PATH)
+        with open(TEXT_DATA_PATH, "r", encoding="utf-8") as f:
+            text_data = json.load(f)
+    st.success("Database loaded successfully!")
+# Query input
+query = st.text_input("Enter your query:")
+# Get answer
+if st.button("Get Answer") and query:
+    if index is None or text_data is None:
+        # Without this guard the lookup below fails with a NameError-style
+        # message that tells the user nothing actionable.
+        st.warning("No database found. Upload PDF files and build the database first.")
+    else:
+        with st.spinner("Searching and generating answer..."):
+            # Broad catch on purpose: this is the UI boundary, and any
+            # failure is reported to the user instead of crashing the app.
+            try:
+                answer = answer_query(query, index, text_data)
+                st.success("Answer generated successfully!")
+                st.write(answer)
+            except Exception as e:
+                st.error(f"Error: {e}")