NHZ committed on
Commit
d386915
·
verified ·
1 Parent(s): 75f7375

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -64
app.py CHANGED
@@ -1,87 +1,107 @@
1
- import os
2
- import streamlit as st
3
- import PyPDF2
4
  import requests
5
- from sentence_transformers import SentenceTransformer
6
  import faiss
 
 
7
  from groq import Groq
 
 
 
8
 
9
- # Initialize Groq client using the secret environment variable
10
  client = Groq(api_key=os.getenv("GROQ_API_KEY"))
11
 
12
# Function to download and read PDF content
def extract_text_from_google_drive():
    """Download the fixed Google Drive PDF and return its full text.

    Returns:
        str: text of every page joined with spaces; pages with no
        extractable text (e.g. scanned images) contribute "".

    Raises:
        requests.HTTPError: if the download does not return HTTP 200.
    """
    # export=download asks Drive for the raw file rather than a viewer page
    link = "https://drive.google.com/uc?export=download&id=1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0"
    response = requests.get(link)
    # Fail loudly instead of silently parsing an HTML error page as a PDF
    response.raise_for_status()
    with open("document.pdf", "wb") as file:
        file.write(response.content)

    with open("document.pdf", "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # extract_text() can return None; guard so join() never sees None
        text = " ".join([(page.extract_text() or "") for page in reader.pages])
    return text
23
 
24
# Function to chunk text
def chunk_text(text, max_length=500):
    """Split *text* into sentence-aligned chunks of roughly *max_length* chars.

    Sentences are delimited by ". "; each emitted chunk is stripped and ends
    with a period. A single sentence longer than max_length still becomes its
    own (oversized) chunk.
    """
    pieces = text.split(". ")
    result = []
    current = ""
    for piece in pieces:
        if len(current) + len(piece) > max_length:
            # buffer is full — flush it and start a fresh one with this piece
            result.append(current.strip())
            current = piece + ". "
        else:
            current += piece + ". "
    if current:
        result.append(current.strip())
    return result
38
 
39
# Function to create FAISS index
def create_faiss_index(chunks, model):
    """Embed the text chunks and load them into a flat L2 FAISS index.

    Args:
        chunks: list of text chunks to embed.
        model: sentence-embedding model exposing .encode(list) -> 2-D array.

    Returns:
        tuple: (faiss.IndexFlatL2, chunks) — chunks are passed through unchanged.
    """
    vectors = model.encode(chunks)
    # index dimensionality must match the embedding width
    dim = len(vectors[0])
    flat_index = faiss.IndexFlatL2(dim)
    flat_index.add(vectors)
    return flat_index, chunks
 
 
46
 
47
# Function to query Groq API
def query_groq(question, model_name="llama-3.3-70b-versatile"):
    """Send *question* as a single user message to the Groq chat API.

    Relies on the module-level `client`; returns the assistant's reply text.
    """
    messages = [{"role": "user", "content": question}]
    completion = client.chat.completions.create(
        messages=messages,
        model=model_name,
    )
    return completion.choices[0].message.content
54
 
55
# Streamlit app
def main():
    """Streamlit entry point: index the Drive PDF with FAISS and answer
    user questions via the Groq API using the best-matching chunk."""
    st.title("RAG-based Application with Groq API")
    st.subheader("Query the document stored on Google Drive")

    st.write("Extracting text from the document...")
    text = extract_text_from_google_drive()
    st.write("Document text extracted successfully!")

    st.write("Chunking and embedding text...")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    chunks = chunk_text(text)
    index, chunks = create_faiss_index(chunks, model)
    st.write(f"Created FAISS index with {len(chunks)} chunks.")

    # Query input
    question = st.text_input("Ask a question based on the document:")
    if question:
        st.write("Searching for relevant chunks...")
        question_embedding = model.encode([question])
        _, indices = index.search(question_embedding, k=1)
        relevant_chunk = chunks[indices[0][0]]

        st.write("Generating answer using Groq API...")
        # BUG FIX: the original sent only the retrieved chunk to the LLM and
        # discarded the user's question; send both so the model can answer it.
        prompt = (
            "Answer the question using the context below.\n\n"
            f"Context: {relevant_chunk}\n\nQuestion: {question}"
        )
        answer = query_groq(prompt)
        st.write("### Answer:")
        st.write(answer)

if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
85
 
86
 
87
 
 
 
 
 
1
  import requests
2
+ import numpy as np
3
  import faiss
4
+ from PyPDF2 import PdfReader
5
+ from transformers import AutoTokenizer, AutoModel
6
  from groq import Groq
7
+ import streamlit as st
8
+ import torch
9
+ import os
10
 
11
+ # Initialize Groq client using secret API key
12
  client = Groq(api_key=os.getenv("GROQ_API_KEY"))
13
 
14
# Function to download and extract content from a public Google Drive PDF link
def extract_pdf_content(drive_url):
    """Download a publicly shared Google Drive PDF and return its text.

    Args:
        drive_url: share link of the form .../file/d/<id>/view...

    Returns:
        str: concatenated page text, or None when the URL cannot be parsed
        or the download fails (the caller already checks for a falsy result).
    """
    # Extract the file ID from the Google Drive URL; a malformed link now
    # yields None instead of raising IndexError.
    try:
        file_id = drive_url.split("/d/")[1].split("/view")[0]
    except IndexError:
        return None
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"

    # Download the PDF content
    response = requests.get(download_url)
    if response.status_code != 200:
        return None

    # Save and extract text from the PDF
    with open("document.pdf", "wb") as f:
        f.write(response.content)
    reader = PdfReader("document.pdf")
    text = ""
    for page in reader.pages:
        # extract_text() can return None on image-only pages; avoid the
        # "str + None" TypeError the original would raise.
        text += page.extract_text() or ""
    return text
33
 
34
# Function to chunk and tokenize text
def chunk_and_tokenize(text, tokenizer, chunk_size=512):
    """Tokenize *text* (no special tokens) and split the token ids into
    consecutive chunks of at most *chunk_size* tokens each."""
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    starts = range(0, len(token_ids), chunk_size)
    return [token_ids[start:start + chunk_size] for start in starts]
39
 
40
# Function to compute embeddings and build FAISS index
def build_faiss_index(chunks, model):
    """Mean-pool the model's last hidden state over each token chunk and
    add the resulting vectors to a flat L2 FAISS index."""
    def _embed(token_chunk):
        # one forward pass per chunk; gradients are not needed for inference
        ids = torch.tensor([token_chunk])
        with torch.no_grad():
            hidden = model(ids).last_hidden_state
        return hidden.mean(dim=1).numpy()

    embeddings = np.vstack([_embed(chunk) for chunk in chunks])

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index
 
 
 
 
53
 
54
# Streamlit app
st.title("RAG-based Application with Groq API")

# Predefined Google Drive link
drive_url = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"

# Extract document content
st.write("Extracting content from the document...")
text = extract_pdf_content(drive_url)
if text:
    st.write("Document extracted successfully!")

    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModel.from_pretrained("bert-base-uncased")

    st.write("Chunking and tokenizing content...")
    chunks = chunk_and_tokenize(text, tokenizer)

    st.write("Building FAISS index...")
    index = build_faiss_index(chunks, model)

    # Query input
    query = st.text_input("Enter your query:")
    if query:
        st.write("Searching for the most relevant chunk...")
        query_tokens = tokenizer.encode(query, add_special_tokens=False)
        # no_grad: inference only — avoids building an autograd graph
        with torch.no_grad():
            query_embedding = (
                model(torch.tensor([query_tokens])).last_hidden_state.mean(dim=1).numpy()
            )
        _, indices = index.search(query_embedding, k=1)

        # Retrieve the most relevant chunk
        relevant_chunk = chunks[indices[0][0]]
        relevant_text = tokenizer.decode(relevant_chunk)
        st.write("Relevant chunk found:", relevant_text)

        # Interact with Groq API
        st.write("Querying the Groq API...")
        # BUG FIX: the original sent only the retrieved chunk and discarded
        # the user's query; include both so the model answers the question.
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": (
                        "Answer the question using the context below.\n\n"
                        f"Context: {relevant_text}\n\nQuestion: {query}"
                    ),
                }
            ],
            model="llama-3.3-70b-versatile",
        )
        st.write("Model Response:", chat_completion.choices[0].message.content)
else:
    st.error("Failed to extract content from the document.")
105
 
106
 
107