Update app.py
app.py
CHANGED
@@ -1,69 +1,106 @@
- import
- import re
- import torch
  import numpy as np
- from
- from
- from
- from langchain.chains.question_answering import load_qa_chain
- from langchain.prompts import PromptTemplate
- from langchain.llms import HuggingFaceHub
  import streamlit as st
- #
- if not HUGGINGFACEHUB_API_TOKEN:
-     raise ValueError("HuggingFace API Token is missing.")
- #
- #
- #
- #
- #
-     If the question cannot be answered based on the context, say "I don't know."
- ""
- st.write("Upload a document and ask questions about it.")
+ import requests
  import numpy as np
+ import faiss
+ from PyPDF2 import PdfReader
+ from transformers import AutoTokenizer, AutoModel
+ from groq import Groq
  import streamlit as st
+ import torch
+ import os

+ # Initialize Groq client using secret API key
+ client = Groq(api_key=os.getenv("GROQ_API_KEY"))

+ # Function to download and extract content from a public Google Drive PDF link
+ def extract_pdf_content(drive_url):
+     # Extract file ID from the Google Drive URL
+     file_id = drive_url.split("/d/")[1].split("/view")[0]
+     download_url = f"https://drive.google.com/uc?export=download&id={file_id}"

+     # Download the PDF content
+     response = requests.get(download_url)
+     if response.status_code != 200:
+         return None

+     # Save and extract text from the PDF
+     with open("document.pdf", "wb") as f:
+         f.write(response.content)
+     reader = PdfReader("document.pdf")
+     text = ""
+     for page in reader.pages:
+         text += page.extract_text()
+     return text

+ # Function to chunk and tokenize text
+ def chunk_and_tokenize(text, tokenizer, chunk_size=512):
+     tokens = tokenizer.encode(text, add_special_tokens=False)
+     chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
+     return chunks

+ # Function to compute embeddings and build FAISS index
+ def build_faiss_index(chunks, model):
+     embeddings = []
+     for chunk in chunks:
+         input_ids = torch.tensor([chunk])
+         with torch.no_grad():
+             embedding = model(input_ids).last_hidden_state.mean(dim=1).detach().numpy()
+         embeddings.append(embedding)
+     embeddings = np.vstack(embeddings)

+     index = faiss.IndexFlatL2(embeddings.shape[1])
+     index.add(embeddings)
+     return index

+ # Streamlit app
+ st.title("RAG-based Application with Groq API")

+ # Predefined Google Drive link
+ drive_url = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"

+ # Extract document content
+ st.write("Extracting content from the document...")
+ text = extract_pdf_content(drive_url)
+ if text:
+     st.write("Document extracted successfully!")

+     # Initialize tokenizer and model
+     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+     model = AutoModel.from_pretrained("bert-base-uncased")

+     st.write("Chunking and tokenizing content...")
+     chunks = chunk_and_tokenize(text, tokenizer)

+     st.write("Building FAISS index...")
+     index = build_faiss_index(chunks, model)

+     # Query input
+     query = st.text_input("Enter your query:")
+     if query:
+         st.write("Searching for the most relevant chunk...")
+         query_tokens = tokenizer.encode(query, add_special_tokens=False)
+         query_embedding = (
+             model(torch.tensor([query_tokens]))
+             .last_hidden_state.mean(dim=1)
+             .detach().numpy()
+         )
+         _, indices = index.search(query_embedding, k=1)

+         # Retrieve the most relevant chunk
+         relevant_chunk = chunks[indices[0][0]]
+         relevant_text = tokenizer.decode(relevant_chunk)
+         st.write("Relevant chunk found:", relevant_text)

+         # Interact with Groq API
+         st.write("Querying the Groq API...")
+         chat_completion = client.chat.completions.create(
+             messages=[
+                 {
+                     "role": "user",
+                     "content": relevant_text,
+                 }
+             ],
+             model="llama-3.3-70b-versatile",
+         )
+         st.write("Model Response:", chat_completion.choices[0].message.content)
+ else:
+     st.error("Failed to extract content from the document.")
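The new version reads GROQ_API_KEY from the environment but never checks that it is set; if the Space secret is missing, the Groq client will complain in its own terms rather than pointing at the Space configuration. A minimal guard, mirroring the HUGGINGFACEHUB_API_TOKEN check the removed version performed (the exact message below is illustrative):

import os

api_key = os.getenv("GROQ_API_KEY")
if not api_key:
    # Fail fast with a clear message instead of a later authentication error.
    raise ValueError("GROQ_API_KEY is missing. Add it as a secret in the Space settings.")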
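In the Groq call, the user message contains only the retrieved chunk; the query itself is never sent to the model, so the response tends to be a continuation of the chunk rather than an answer to the question. A common RAG pattern is to combine the retrieved context and the question in a single prompt. A minimal sketch, assuming the client, relevant_text, and query variables defined above (the prompt wording is illustrative and echoes the "I don't know" instruction from the removed prompt template):

prompt = (
    "Answer the question using only the context below. "
    "If the question cannot be answered based on the context, say \"I don't know.\"\n\n"
    f"Context:\n{relevant_text}\n\n"
    f"Question: {query}"
)
chat_completion = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model="llama-3.3-70b-versatile",
)
st.write("Model Response:", chat_completion.choices[0].message.content)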
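build_faiss_index stores mean-pooled BERT vectors in an L2 index, and the query retrieves a single nearest chunk. A frequently used alternative, not what this commit does, is to L2-normalize the embeddings and search an inner-product index, which makes the score a cosine similarity, and to retrieve more than one chunk so the model sees more context. A sketch under those assumptions:

import faiss
import numpy as np

def build_cosine_index(embeddings: np.ndarray) -> faiss.IndexFlatIP:
    emb = embeddings.astype("float32")       # copy; FAISS expects float32
    faiss.normalize_L2(emb)                  # in-place L2 normalization
    index = faiss.IndexFlatIP(emb.shape[1])  # inner product == cosine on unit vectors
    index.add(emb)
    return index

The query side would normalize query_embedding the same way and call index.search(query_embedding, k=3), concatenating the decoded top chunks before building the prompt.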
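Because Streamlit reruns the whole script on every interaction, this layout re-downloads the PDF, reloads bert-base-uncased, and rebuilds the FAISS index each time the user submits a query. Recent Streamlit versions provide st.cache_resource for exactly this case; a sketch assuming the extract_pdf_content, chunk_and_tokenize, and build_faiss_index functions from the file above:

import streamlit as st
from transformers import AutoTokenizer, AutoModel

@st.cache_resource
def load_encoder(name: str = "bert-base-uncased"):
    # Loaded once per process and shared across reruns.
    return AutoTokenizer.from_pretrained(name), AutoModel.from_pretrained(name)

@st.cache_resource
def build_index_for(url: str):
    # The FAISS index may not pickle cleanly, so cache it as a resource, not as data.
    tokenizer, model = load_encoder()
    text = extract_pdf_content(url)
    chunks = chunk_and_tokenize(text, tokenizer)
    return chunks, build_faiss_index(chunks, model)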