Spaces:

NHZ
/

First_Aid_Kit

Sleeping

App Files Files Community

NHZ committed on Jan 5, 2025

Commit

636755b

verified ·

1 Parent(s): b891ccb

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -118

app.py CHANGED Viewed

@@ -1,122 +1,106 @@
 import os
-import requests
-import torch
-from transformers import AutoTokenizer, AutoModel
-from PyPDF2 import PdfReader
-from langchain.vectorstores import FAISS
-from langchain.chains import RetrievalQA
-from langchain.prompts import PromptTemplate
-from langchain.llms.base import LLM
-from pydantic import Field
-from typing import Optional, List
 import streamlit as st
-# Custom wrapper for Groq API
-class GroqLLM(LLM):
-    api_key: str = Field(..., description="API key for Groq")
-    model: str = "llama-3.3-70b-versatile"
-    @property
-    def _llm_type(self) -> str:
-        return "groq"
-    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
-        headers = {"Authorization": f"Bearer {self.api_key}"}
-        json_data = {
-            "model": self.model,
-            "messages": [{"role": "user", "content": prompt}],
-        }
-        response = requests.post(
-            "https://api.groq.com/v1/chat/completions", headers=headers, json=json_data
-        )
-        if response.status_code != 200:
-            raise ValueError(f"Groq API call failed: {response.status_code}, {response.text}")
-        data = response.json()
-        return data["choices"][0]["message"]["content"]
-# Initialize Groq API LLM
-llm = GroqLLM(api_key="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")
-# Function to extract content from a public Google Drive PDF link
-def extract_pdf_content(drive_url):
-    file_id = drive_url.split("/d/")[1].split("/view")[0]
-    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
-    response = requests.get(download_url)
-    if response.status_code != 200:
-        return None
-    with open("document.pdf", "wb") as f:
-        f.write(response.content)
-    reader = PdfReader("document.pdf")
-    text = ""
-    for page in reader.pages:
-        text += page.extract_text()
     return text
-# Function to create a FAISS vector store
-def create_vector_store(text):
-    # Split the text into sentences and clean it
-    sentences = [sentence.strip() for sentence in text.split(". ") if sentence.strip()]
-    # Load the model and tokenizer from Hugging Face
-    model_name = "sentence-transformers/all-MiniLM-L6-v2"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModel.from_pretrained(model_name)
-    def embed(sentence):
-        tokens = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
-        with torch.no_grad():
-            embeddings = model(**tokens).last_hidden_state.mean(dim=1).squeeze().numpy()
-        return embeddings
-    # Create a FAISS vector store
-    vector_store = FAISS.from_texts(
-        texts=sentences, embedding=lambda x: embed(x)
-    )
-    return vector_store, sentences
-# Streamlit app
-st.title("RAG-based Application with Focused Context")
-drive_url = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
-text = extract_pdf_content(drive_url)
-if text:
-    st.write("Document extracted successfully!")
-    vector_store, sentences = create_vector_store(text)
-    st.write("Vector store created!")
-    query = st.text_input("Enter your query:")
-    if query:
-        retriever = vector_store.as_retriever()
-        retriever.search_kwargs["k"] = 3
-        prompt_template = PromptTemplate(
-            template="""
-            Use the following context to answer the question:
-            {context}
-            Question: {question}
-            Answer:""",
-            input_variables=["context", "question"]
-        )
-        qa_chain = RetrievalQA.from_chain_type(
-            retriever=retriever,
-            llm=llm,
-            chain_type="stuff",
-            return_source_documents=True
-        )
-        response = qa_chain({"query": query})
-        answer = response["result"]
-        st.write("Answer:", answer)
-else:
-    st.error("Failed to extract content from the document.")

 import os
 import streamlit as st
+import requests
+import PyPDF2
+from sentence_transformers import SentenceTransformer
+import faiss
+import nltk
+from groq import Groq
# Fetch the sentence-tokenizer data used by nltk.sent_tokenize.
# "punkt" is what older NLTK releases load; newer releases look for
# "punkt_tab" instead, so request both. quiet=True keeps the download
# chatter out of the app log on every Streamlit rerun.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)
# Initialize Groq client (API key is read from the GROQ_API_KEY env var)
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+# Function to extract text from a PDF
def extract_text_from_pdf(pdf_url):
    """Download a PDF and return the text of all its pages as one string.

    Args:
        pdf_url: URL of the PDF. A Google Drive share link of the form
            ``.../file/d/<id>/view?...`` is rewritten to the matching
            ``.../uc?id=<id>`` download link; any other URL is used as-is.

    Returns:
        The text of every page concatenated in page order. Pages for which
        the reader yields no text contribute an empty string.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    import io
    import re

    # Keep only the file id from a Drive share link, whatever follows it
    # ("/view?usp=sharing", "/view?usp=drive_link", "/edit", ...).
    direct_url = re.sub(r"file/d/([^/?#]+).*", r"uc?id=\1", pdf_url)

    # Fail loudly on an HTTP error instead of handing an HTML error page
    # to the PDF parser; the timeout keeps the app from hanging forever.
    response = requests.get(direct_url, timeout=30)
    response.raise_for_status()

    # Parse from memory: no shared "temp.pdf" in the working directory and
    # nothing to clean up if parsing fails.
    reader = PyPDF2.PdfReader(io.BytesIO(response.content))
    return "".join(page.extract_text() or "" for page in reader.pages)
+# Function to chunk text
def chunk_text(text, chunk_size=300):
    """Group the sentences of *text* into chunks of roughly *chunk_size* words.

    Sentences are found with ``nltk.sent_tokenize`` and are never split: a
    sentence that is longer than ``chunk_size`` on its own becomes a chunk
    by itself.

    Args:
        text: The text to split.
        chunk_size: Maximum number of whitespace-separated words per chunk
            (exceeded only by a single over-long sentence).

    Returns:
        A list of non-empty chunk strings; sentences inside a chunk are
        joined with a single space. Empty when no sentences are found.
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in nltk.sent_tokenize(text):
        word_count = len(sentence.split())
        # Flush only when there is something to flush; otherwise an
        # over-long first sentence would emit an empty "" chunk.
        if current_chunk and current_length + word_count > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += word_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
+# Function to create embeddings and store them in FAISS
def create_faiss_index(chunks):
    """Embed *chunks* and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of text chunks to embed.

    Returns:
        A ``(index, embeddings)`` tuple: the populated ``faiss.IndexFlatL2``
        and the embedding array that was added to it, in the same order as
        ``chunks``.

    Raises:
        ValueError: if ``chunks`` is empty, since there is nothing to index
            and no embedding dimension to size the index with.
    """
    # Check before loading the model so an empty document fails fast and
    # with a clear message.
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks")

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(chunks)
    # The index dimension is taken from the embedding width.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings
+# Function to query FAISS
def query_faiss(index, query, chunks, model, k=3):
    """Return the chunks closest to *query* according to the FAISS index.

    Args:
        index: Index whose ``search(vectors, k=...)`` returns
            ``(distances, indices)``.
        query: The question text to embed.
        chunks: The chunk list the index was built from; result positions
            refer to this list.
        model: Object whose ``encode(list_of_texts)`` returns the query
            vectors to search with.
        k: Maximum number of chunks to return (default 3).

    Returns:
        Up to ``k`` chunks, in the order the index returned them. Empty when
        there are no chunks or ``k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks.
    top_k = min(k, len(chunks))
    if top_k <= 0:
        return []

    query_vector = model.encode([query])
    _distances, indices = index.search(query_vector, k=top_k)
    # FAISS reports a missing neighbour as -1; used as a Python index that
    # would silently select the last chunk, so drop out-of-range positions.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
+# Main Streamlit App
+def main():
+    st.title("RAG-based Application")
+    st.write("Interact with your document using Groq-powered model.")
+    # Pre-defined document link
+    doc_link = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
+    # Extract Document Content
+    if "document_text" not in st.session_state:
+        st.write("Extracting document content...")
+        text = extract_text_from_pdf(doc_link)
+        st.session_state['document_text'] = text
+        st.success("Document content extracted!")
+    # Process Document and Create FAISS Index
+    if 'document_text' in st.session_state and "faiss_index" not in st.session_state:
+        st.write("Processing document...")
+        chunks = chunk_text(st.session_state['document_text'])
+        index, embeddings = create_faiss_index(chunks)
+        st.session_state['faiss_index'] = index
+        st.session_state['chunks'] = chunks
+        st.session_state['model'] = SentenceTransformer("all-MiniLM-L6-v2")
+        st.success(f"Document processed into {len(chunks)} chunks!")
+    # Query the Document
+    if 'faiss_index' in st.session_state:
+        st.header("Ask Questions")
+        query = st.text_input("Enter your question here")
+        if st.button("Query Document"):
+            results = query_faiss(st.session_state['faiss_index'], query, st.session_state['chunks'], st.session_state['model'])
+            st.write("### Results from Document:")
+            for i, result in enumerate(results):
+                st.write(f"**Result {i+1}:** {result}")
+            # Use Groq API for additional insights
+            chat_completion = client.chat.completions.create(
+                messa