Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
| 1 |
-
import numpy as np # Import numpy first to avoid compatibility issues
|
| 2 |
-
import faiss
|
| 3 |
import os
|
| 4 |
import re
|
| 5 |
import requests
|
|
@@ -9,10 +7,11 @@ import faiss
|
|
| 9 |
from sentence_transformers import SentenceTransformer
|
| 10 |
from groq import Groq
|
| 11 |
|
| 12 |
-
#
|
| 13 |
DOCUMENT_URL = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
|
|
|
|
| 14 |
|
| 15 |
-
# Function to download document
|
| 16 |
def download_document(file_url):
|
| 17 |
file_id = file_url.split("/d/")[1].split("/")[0]
|
| 18 |
download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
|
|
@@ -30,8 +29,8 @@ def extract_text_from_pdf(file_path):
|
|
| 30 |
text += page.extract_text()
|
| 31 |
return text
|
| 32 |
|
| 33 |
-
# Chunk
|
| 34 |
-
def chunk_text(text, chunk_size=
|
| 35 |
sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
|
| 36 |
chunks, current_chunk = [], ""
|
| 37 |
for sentence in sentences:
|
|
@@ -61,16 +60,16 @@ def query_faiss(query, index, chunks, model, k=5):
|
|
| 61 |
# Streamlit application
|
| 62 |
def main():
|
| 63 |
st.title("RAG-based Application")
|
| 64 |
-
st.write("
|
| 65 |
|
| 66 |
-
#
|
| 67 |
st.write("Processing the pre-configured document...")
|
| 68 |
document_path = download_document(DOCUMENT_URL)
|
| 69 |
text = extract_text_from_pdf(document_path)
|
| 70 |
chunks = chunk_text(text)
|
| 71 |
|
| 72 |
-
#
|
| 73 |
-
st.write("
|
| 74 |
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 75 |
index, embeddings = create_faiss_index(chunks, embedding_model)
|
| 76 |
st.success("Document processed and indexed!")
|
|
@@ -78,16 +77,16 @@ def main():
|
|
| 78 |
# Query the database
|
| 79 |
query = st.text_input("Enter your query")
|
| 80 |
if query:
|
|
|
|
| 81 |
results = query_faiss(query, index, chunks, embedding_model)
|
| 82 |
st.write("Top relevant chunks:")
|
| 83 |
for i, result in enumerate(results):
|
| 84 |
st.write(f"{i+1}. {result}")
|
| 85 |
|
| 86 |
# Groq API interaction
|
| 87 |
-
groq_api_key = os.environ.get("GROQ_API_KEY") #
|
| 88 |
if groq_api_key:
|
| 89 |
client = Groq(api_key=groq_api_key)
|
| 90 |
-
|
| 91 |
if query:
|
| 92 |
st.write("Fetching response from Groq API...")
|
| 93 |
chat_completion = client.chat.completions.create(
|
|
@@ -102,3 +101,4 @@ def main():
|
|
| 102 |
if __name__ == "__main__":
|
| 103 |
main()
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import re
|
| 3 |
import requests
|
|
|
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
from groq import Groq
|
| 9 |
|
| 10 |
+
# Constants
|
| 11 |
DOCUMENT_URL = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
|
| 12 |
+
CHUNK_SIZE = 500
|
| 13 |
|
| 14 |
+
# Function to download document
|
| 15 |
def download_document(file_url):
|
| 16 |
file_id = file_url.split("/d/")[1].split("/")[0]
|
| 17 |
download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
|
|
|
|
| 29 |
text += page.extract_text()
|
| 30 |
return text
|
| 31 |
|
| 32 |
+
# Chunk text into smaller parts
|
| 33 |
+
def chunk_text(text, chunk_size=CHUNK_SIZE):
|
| 34 |
sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
|
| 35 |
chunks, current_chunk = [], ""
|
| 36 |
for sentence in sentences:
|
|
|
|
| 60 |
# Streamlit application
|
| 61 |
def main():
|
| 62 |
st.title("RAG-based Application")
|
| 63 |
+
st.write("This application uses a pre-configured document as the dataset for query responses.")
|
| 64 |
|
| 65 |
+
# Download and process the document
|
| 66 |
st.write("Processing the pre-configured document...")
|
| 67 |
document_path = download_document(DOCUMENT_URL)
|
| 68 |
text = extract_text_from_pdf(document_path)
|
| 69 |
chunks = chunk_text(text)
|
| 70 |
|
| 71 |
+
# Create FAISS index
|
| 72 |
+
st.write("Creating FAISS index...")
|
| 73 |
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 74 |
index, embeddings = create_faiss_index(chunks, embedding_model)
|
| 75 |
st.success("Document processed and indexed!")
|
|
|
|
| 77 |
# Query the database
|
| 78 |
query = st.text_input("Enter your query")
|
| 79 |
if query:
|
| 80 |
+
st.write("Fetching relevant content from the document...")
|
| 81 |
results = query_faiss(query, index, chunks, embedding_model)
|
| 82 |
st.write("Top relevant chunks:")
|
| 83 |
for i, result in enumerate(results):
|
| 84 |
st.write(f"{i+1}. {result}")
|
| 85 |
|
| 86 |
# Groq API interaction
|
| 87 |
+
groq_api_key = os.environ.get("GROQ_API_KEY") # Fetch API key from Hugging Face Secrets
|
| 88 |
if groq_api_key:
|
| 89 |
client = Groq(api_key=groq_api_key)
|
|
|
|
| 90 |
if query:
|
| 91 |
st.write("Fetching response from Groq API...")
|
| 92 |
chat_completion = client.chat.completions.create(
|
|
|
|
| 101 |
if __name__ == "__main__":
|
| 102 |
main()
|
| 103 |
|
| 104 |
+
|