NHZ committed on
Commit
d285555
·
verified ·
1 Parent(s): 2742de0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -89
app.py CHANGED
@@ -1,113 +1,102 @@
1
  import os
 
2
  import requests
3
- import PyPDF2
4
- import faiss
5
- import numpy as np
6
  import streamlit as st
7
- from transformers import AutoTokenizer, AutoModel
 
8
  from groq import Groq
9
 
10
- # Download file from Google Drive link
11
- def download_file_from_drive(url):
12
- file_id = url.split("/d/")[1].split("/")[0]
13
- download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
 
 
 
14
  response = requests.get(download_url)
15
- pdf_path = "document.pdf"
16
- with open(pdf_path, "wb") as f:
17
  f.write(response.content)
18
- return pdf_path
19
 
20
  # Extract text from PDF
21
def extract_text_from_pdf(pdf_path):
    """Extract and space-join the text of every page in the PDF at pdf_path.

    PyPDF2's extract_text() may return None for image-only pages; substitute
    "" so the join doesn't raise TypeError.
    """
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        text = " ".join(page.extract_text() or "" for page in reader.pages)
    return text
26
 
27
- # Chunk text
28
def chunk_text(text, chunk_size=500):
    """Split *text* into chunks of at most chunk_size whitespace-separated words.

    Returns a list of strings; empty input yields an empty list.
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[start:start + chunk_size]))
    return chunks
32
 
33
- # Generate embeddings
34
def generate_embeddings(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed each text chunk by mean-pooling the model's last hidden state.

    Args:
        chunks: list of strings to embed.
        model_name: Hugging Face model id for the tokenizer/encoder pair.

    Returns:
        A 2-D numpy array with one row per chunk.

    The tokenizer/model pair is cached on the function object: the original
    implementation reloaded both from disk on every call, which main() does
    once per user query.
    """
    cache = getattr(generate_embeddings, "_cache", None)
    if cache is None:
        cache = generate_embeddings._cache = {}
    if model_name not in cache:
        cache[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    tokenizer, model = cache[model_name]
    embeddings = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        # Mean over the token (sequence) dimension -> one vector per chunk.
        embeddings.append(outputs.last_hidden_state.mean(dim=1).detach().numpy())
    return np.vstack(embeddings)
43
-
44
- # Store embeddings in FAISS
45
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over the given embedding matrix."""
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index
50
 
51
- # Groq API Integration
52
def query_groq_api(query, api_key):
    """Send *query* as a single user message to Groq and return the reply text."""
    client = Groq(api_key=api_key)
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": query}],
        model="llama-3.3-70b-versatile",
    )
    return completion.choices[0].message.content
64
 
65
- # Streamlit App
66
def main():
    """Streamlit UI: index a Google-Drive-hosted PDF, then answer questions over it.

    Sidebar collects the Groq API key and the Drive link; "Process Document"
    builds the FAISS index and stores it in st.session_state so later reruns
    (Streamlit re-executes main() on every interaction) can query it.
    """
    st.title("RAG-based Application")
    st.sidebar.title("Settings")

    groq_api_key = st.sidebar.text_input("Enter your Groq API Key", type="password")
    google_drive_url = st.sidebar.text_input("Enter Google Drive File Link")

    if st.sidebar.button("Process Document"):
        if not google_drive_url:
            # Guard: an empty link would crash download_file_from_drive with
            # IndexError on split("/d/").
            st.error("Please enter a Google Drive file link first.")
        else:
            st.info("Downloading document...")
            pdf_path = download_file_from_drive(google_drive_url)
            st.success("Document downloaded successfully!")

            st.info("Extracting text...")
            text = extract_text_from_pdf(pdf_path)
            st.success("Text extracted successfully!")

            st.info("Chunking text...")
            chunks = chunk_text(text)
            st.success(f"Document chunked into {len(chunks)} chunks.")

            st.info("Generating embeddings...")
            embeddings = generate_embeddings(chunks)
            st.success("Embeddings generated successfully!")

            st.info("Creating FAISS index...")
            index = create_faiss_index(embeddings)
            st.success("FAISS index created successfully!")

            # Persist across Streamlit reruns so queries don't reprocess the PDF.
            st.session_state.index = index
            st.session_state.chunks = chunks

    if "index" in st.session_state:
        query = st.text_input("Ask a question:")
        if st.button("Search"):
            st.info("Querying FAISS index...")
            query_embeddings = generate_embeddings([query])
            distances, indices = st.session_state.index.search(query_embeddings, k=5)
            relevant_chunks = [st.session_state.chunks[i] for i in indices[0]]
            st.success("Relevant chunks retrieved!")

            st.info("Generating answer via Groq API...")
            context = " ".join(relevant_chunks)
            answer = query_groq_api(context + "\n" + query, api_key=groq_api_key)
            st.success("Answer generated!")
            st.write(answer)
111
 
112
  if __name__ == "__main__":
113
  main()
 
 
1
  import os
2
+ import re
3
  import requests
4
+ import pdfplumber
 
 
5
  import streamlit as st
6
+ import faiss
7
+ from sentence_transformers import SentenceTransformer
8
  from groq import Groq
9
 
10
+ # Built-in Google Drive document link
11
+ DOCUMENT_URL = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
12
+
13
+ # Function to download document from the Google Drive link
14
def download_document(file_url):
    """Download a Google Drive "share" link to ./document.pdf and return the path.

    Args:
        file_url: A Drive link of the form https://drive.google.com/file/d/<id>/view...

    Returns:
        The local path "document.pdf".

    Raises:
        IndexError: if the URL does not contain a "/d/<id>/" segment.
        requests.HTTPError: if the download request fails.
    """
    file_id = file_url.split("/d/")[1].split("/")[0]
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
    response = requests.get(download_url, timeout=60)
    # Fail loudly instead of silently saving an HTML error page as a "PDF".
    response.raise_for_status()
    output = "document.pdf"
    with open(output, "wb") as f:
        f.write(response.content)
    return output
22
 
23
  # Extract text from PDF
24
def extract_text_from_pdf(file_path):
    """Concatenate the extractable text of every page in the PDF at file_path.

    pdfplumber's extract_text() returns None for pages with no extractable
    text; without the `or ""` guard the original `text += ...` raised
    TypeError on such pages. Parts are joined once instead of built with
    repeated string concatenation.
    """
    parts = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            parts.append(page.extract_text() or "")
    return "".join(parts)
30
 
31
+ # Chunk the text
32
def chunk_text(text, chunk_size=500):
    """Split *text* into sentence-aligned chunks of roughly chunk_size characters.

    Sentences are found with a heuristic regex (split after "." or "?" not
    preceded by an abbreviation-like pattern). A sentence longer than
    chunk_size becomes its own oversized chunk.

    Fixes vs. the original: (a) when the very first sentence exceeds
    chunk_size, an empty-string chunk was appended; (b) empty or
    whitespace-only input returned [""] instead of [].
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    chunks, current_chunk = [], ""
    for sentence in sentences:
        if not sentence:
            continue
        if len(current_chunk) + len(sentence) < chunk_size:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
44
 
45
+ # Vectorize and store in FAISS
46
def create_faiss_index(chunks, model):
    """Encode *chunks* with *model* and index the vectors in a flat L2 FAISS index.

    Returns the (index, embeddings) pair so callers can reuse the vectors.
    """
    vectors = model.encode(chunks)
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index, vectors
52
 
53
+ # Query FAISS index
54
def query_faiss(query, index, chunks, model, k=5):
    """Return the k chunks whose embeddings are nearest (L2) to the query's."""
    embedded_query = model.encode([query])
    _, nearest = index.search(embedded_query, k)
    return [chunks[idx] for idx in nearest[0]]
 
 
 
 
 
 
 
 
58
 
59
+ # Streamlit application
60
def main():
    """Streamlit entry point: index the pre-configured PDF once, then answer queries.

    Fixes vs. the original: (1) the document was downloaded, parsed, embedded,
    and indexed on EVERY Streamlit rerun (every interaction), because nothing
    was cached — heavy work now runs once per session via st.session_state;
    (2) the Groq request sent only the bare query, discarding the retrieved
    chunks — the prompt now includes them, so answers are actually grounded
    in the document (RAG).
    """
    st.title("RAG-based Application")
    st.write("Interacting with a knowledge base derived from the uploaded document.")

    # Streamlit re-executes main() on every interaction; cache the expensive
    # pipeline (download -> parse -> chunk -> embed -> index) in session_state.
    if "index" not in st.session_state:
        st.write("Processing the pre-configured document...")
        document_path = download_document(DOCUMENT_URL)
        text = extract_text_from_pdf(document_path)
        chunks = chunk_text(text)

        st.write("Loading model and creating FAISS index...")
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        index, embeddings = create_faiss_index(chunks, embedding_model)

        st.session_state.index = index
        st.session_state.chunks = chunks
        st.session_state.model = embedding_model
        st.success("Document processed and indexed!")

    # Query the database
    query = st.text_input("Enter your query")
    if query:
        results = query_faiss(query, st.session_state.index,
                              st.session_state.chunks, st.session_state.model)
        st.write("Top relevant chunks:")
        for i, result in enumerate(results):
            st.write(f"{i+1}. {result}")

        # Groq API interaction
        groq_api_key = os.environ.get("GROQ_API_KEY")  # Securely fetched from Hugging Face Secrets
        if groq_api_key:
            client = Groq(api_key=groq_api_key)
            st.write("Fetching response from Groq API...")
            # Ground the model's answer in the retrieved chunks, not just the query.
            prompt = "Context:\n" + "\n".join(results) + f"\n\nQuestion: {query}"
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama-3.3-70b-versatile"
            )
            st.write("Response:")
            st.write(chat_completion.choices[0].message.content)
        else:
            st.error("Groq API key not configured in Hugging Face Secrets.")
 
 
 
 
 
 
99
 
100
  if __name__ == "__main__":
101
  main()
102
+