NHZ committed on
Commit
644455e
·
verified ·
1 Parent(s): acd8ea0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -66
app.py CHANGED
@@ -1,86 +1,113 @@
1
  import os
2
- import re
3
  import requests
4
- import pdfplumber
5
  import streamlit as st
 
6
  import faiss
7
  from sentence_transformers import SentenceTransformer
 
8
 
9
- # Constants
10
- DOCUMENT_URL = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
11
- CHUNK_SIZE = 500
12
-
13
- # Function to download document
14
- def download_document(file_url):
15
- file_id = file_url.split("/d/")[1].split("/")[0]
16
- download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
17
  response = requests.get(download_url)
18
- output = "document.pdf"
19
- with open(output, "wb") as f:
20
- f.write(response.content)
21
- return output
22
-
23
- # Extract text from PDF
24
- def extract_text_from_pdf(file_path):
25
- text = ""
26
- with pdfplumber.open(file_path) as pdf:
27
- for page in pdf.pages:
28
- text += page.extract_text()
29
- return text
30
-
31
- # Chunk text into smaller parts
32
- def chunk_text(text, chunk_size=CHUNK_SIZE):
33
- sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
34
- chunks, current_chunk = [], ""
35
  for sentence in sentences:
36
- if len(current_chunk) + len(sentence) < chunk_size:
37
- current_chunk += sentence + " "
38
- else:
39
- chunks.append(current_chunk.strip())
40
- current_chunk = sentence + " "
 
 
 
41
  if current_chunk:
42
- chunks.append(current_chunk.strip())
 
43
  return chunks
44
 
45
- # Vectorize and store in FAISS
46
- def create_faiss_index(chunks, model):
47
- embeddings = model.encode(chunks)
48
  dimension = embeddings.shape[1]
49
  index = faiss.IndexFlatL2(dimension)
50
  index.add(embeddings)
51
- return index, embeddings
52
 
53
- # Query FAISS index
54
- def query_faiss(query, index, chunks, model, k=5):
55
- query_embedding = model.encode([query])
56
- distances, indices = index.search(query_embedding, k)
57
- return [chunks[i] for i in indices[0]]
58
 
59
- # Streamlit application
60
  def main():
61
- st.title("Document-Based Query Application")
62
- st.write("This application uses a pre-configured document as the dataset for answering queries.")
63
-
64
- # Download and process the document
65
- st.write("Processing the pre-configured document...")
66
- document_path = download_document(DOCUMENT_URL)
67
- text = extract_text_from_pdf(document_path)
68
- chunks = chunk_text(text)
69
-
70
- # Create FAISS index
71
- st.write("Creating FAISS index...")
72
- embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
73
- index, embeddings = create_faiss_index(chunks, embedding_model)
74
- st.success("Document processed and indexed!")
75
-
76
- # Query the database
77
- query = st.text_input("Enter your query")
78
- if query:
79
- st.write("Fetching relevant content from the document...")
80
- results = query_faiss(query, index, chunks, embedding_model)
81
- st.write("Top relevant chunks:")
82
- for i, result in enumerate(results):
83
- st.write(f"{i+1}. {result}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  if __name__ == "__main__":
86
  main()
 
1
  import os
 
2
  import requests
 
3
  import streamlit as st
4
+ import numpy as np
5
  import faiss
6
  from sentence_transformers import SentenceTransformer
7
+ from groq import Groq
8
 
9
# Function to download document from a public Google Drive link
def download_file_from_public_link(url):
    """Download a publicly shared Google Drive file and return its text content.

    Args:
        url: A Drive share link of the form
            ``https://drive.google.com/file/d/<FILE_ID>/view...``.

    Returns:
        The response body decoded as text.

    Raises:
        Exception: if the link does not contain a ``/d/<id>/`` segment or the
            download does not return HTTP 200.
    """
    # Extract the file id defensively: a malformed link would otherwise
    # surface as an opaque IndexError to the Streamlit error handler.
    try:
        file_id = url.split("/d/")[1].split("/")[0]
    except IndexError:
        raise Exception("Invalid Google Drive share link: missing '/d/<id>/' segment.")
    download_url = f"https://drive.google.com/uc?id={file_id}&export=download"

    # A timeout prevents the app from hanging forever on a stalled request
    # (requests has no default timeout).
    response = requests.get(download_url, timeout=60)
    if response.status_code == 200:
        return response.text
    else:
        raise Exception("Failed to download file from Google Drive.")
18
+
19
# Function to preprocess text
def preprocess_text(text, chunk_size=512):
    """Split *text* into chunks of at most ``chunk_size`` words.

    Sentences are delimited naively on ".". A sentence longer than
    ``chunk_size`` words becomes its own (oversized) chunk rather than being
    split mid-sentence.

    Args:
        text: The raw document text.
        chunk_size: Maximum number of words per chunk (soft limit, see above).

    Returns:
        A list of non-empty chunk strings.
    """
    sentences = text.split(".")
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        # Skip empty/whitespace fragments (e.g. from a trailing "." or "..");
        # they would otherwise produce blank entries and stray spaces.
        if not sentence.strip():
            continue
        sentence_length = len(sentence.split())
        # Flush only when the chunk is non-empty: previously an oversized
        # first sentence caused an empty-string chunk to be appended.
        if current_length + sentence_length > chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
39
 
40
# Function to create a FAISS index
def create_faiss_index(embeddings):
    """Build an exact L2 (Euclidean) FAISS index over the given embedding matrix.

    Args:
        embeddings: 2-D array of shape (num_chunks, embedding_dim).

    Returns:
        A populated ``faiss.IndexFlatL2`` ready for ``search`` calls.
    """
    _, embedding_dim = embeddings.shape[0], embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(embedding_dim)
    faiss_index.add(embeddings)
    return faiss_index
46
 
47
# Function to query FAISS index
def query_faiss_index(index, query_embedding, top_k=5):
    """Return the nearest-neighbor ids and distances for a query embedding.

    Args:
        index: A FAISS index exposing ``search(query, k)``.
        query_embedding: 2-D array with a single query row.
        top_k: Number of neighbors to retrieve.

    Returns:
        Tuple ``(indices, distances)`` for the first (only) query row.
    """
    all_distances, all_indices = index.search(query_embedding, top_k)
    nearest_ids = all_indices[0]
    nearest_distances = all_distances[0]
    return nearest_ids, nearest_distances
 
51
 
52
# Streamlit App
def main():
    """Streamlit entry point: download a document, index it with FAISS, and
    answer user queries with Groq grounded in the retrieved chunks."""
    st.title("RAG-based Application")

    # Load Groq API Key from environment (set in Hugging Face secrets)
    groq_api_key = os.getenv("GROQ_API_KEY")
    if not groq_api_key:
        st.error("Groq API Key is missing. Ensure it is set as a secret in Hugging Face.")
        return

    # Predefined Google Drive link
    drive_link = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"

    if st.button("Load Document"):
        try:
            document_text = download_file_from_public_link(drive_link)
            st.success("Document downloaded successfully!")

            # Process the document
            chunks = preprocess_text(document_text)
            st.write(f"Document split into {len(chunks)} chunks.")

            # Load the embedding model once and batch-encode all chunks:
            # per-chunk encode() calls in a Python loop are far slower.
            model = SentenceTransformer("all-MiniLM-L6-v2")
            embeddings = np.asarray(model.encode(chunks))

            # Create FAISS index
            index = create_faiss_index(embeddings)
            st.success("FAISS index created.")

            # Persist across Streamlit reruns so each query does not
            # re-download and re-index the document.
            st.session_state["index"] = index
            st.session_state["chunks"] = chunks
            st.session_state["model"] = model

        except Exception as e:
            st.error(f"Failed to load document: {str(e)}")

    if "index" in st.session_state and "chunks" in st.session_state:
        query = st.text_input("Enter your query")
        if query:
            # Reuse the cached model instead of reloading it on every query
            # (fallback load kept for sessions indexed before this change).
            model = st.session_state.get("model") or SentenceTransformer("all-MiniLM-L6-v2")
            query_embedding = model.encode([query])
            indices, distances = query_faiss_index(st.session_state["index"], query_embedding)

            # Display results and collect the retrieved text for the LLM prompt
            relevant_chunks = []
            st.write("Relevant Chunks:")
            for i, idx in enumerate(indices):
                chunk = st.session_state["chunks"][idx]
                relevant_chunks.append(chunk)
                st.write(f"Chunk {i + 1} (Distance: {distances[i]}):")
                st.write(chunk)

            # Query Groq API, grounding the answer in the retrieved chunks.
            # Previously the raw query was sent with no document context,
            # so the retrieval step never influenced the LLM's answer.
            context = "\n\n".join(relevant_chunks)
            prompt = (
                "Answer the question using only the context below.\n\n"
                f"Context:\n{context}\n\n"
                f"Question: {query}"
            )
            client = Groq(api_key=groq_api_key)
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama-3.3-70b-versatile",
            )
            st.write("Groq Model Response:")
            st.write(chat_completion.choices[0].message.content)
110
+
111
 
112
# Script entry point: launch the Streamlit app.
if __name__ == "__main__":
    main()