SHAMIL SHAHBAZ AWAN committed on
Commit
1f8c160
·
verified ·
1 Parent(s): 755213a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -0
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np
from groq import Client  # Groq API client

# --- Configuration and shared resources (Streamlit re-runs this on every interaction) ---

# Hugging Face token, set in the Hugging Face Spaces secret manager.
HUGGINGFACE_KEY = os.getenv("HF_API_TOKEN")
if not HUGGINGFACE_KEY:
    st.error("Hugging Face API token not found. Please set it in the Hugging Face Secrets.")

# BUGFIX: the Groq client needs its own API key — a Hugging Face token
# will not authenticate against the Groq API.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("Groq API key not found. Please set GROQ_API_KEY in the secrets.")
groq_client = Client(api_key=GROQ_API_KEY)

# Sentence embedder used both for indexing documents and encoding queries.
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Paths
DOCUMENTS_FOLDER = "documents/"
VECTORSTORE_FOLDER = "vectorstore/"

# Ensure both folders exist so os.listdir / faiss.write_index cannot fail
# on a fresh deployment.
os.makedirs(DOCUMENTS_FOLDER, exist_ok=True)
os.makedirs(VECTORSTORE_FOLDER, exist_ok=True)

# Load a persisted FAISS index if one exists, otherwise start an empty
# L2 index sized to the embedder's output dimension.
vectorstore_path = os.path.join(VECTORSTORE_FOLDER, "index.faiss")
if os.path.exists(vectorstore_path):
    index = faiss.read_index(vectorstore_path)
else:
    index = faiss.IndexFlatL2(embedder.get_sentence_embedding_dimension())
35
# Load and process documents
def load_documents(folder):
    """Read every PDF directly under *folder* and return one text string per file.

    Parameters
    ----------
    folder : str
        Directory to scan; only files ending in ``.pdf`` are read.

    Returns
    -------
    list[str]
        Extracted text of each PDF, in ``os.listdir`` order. A PDF whose
        pages have no text layer contributes an empty string.
    """
    documents = []
    for filename in os.listdir(folder):
        if filename.endswith(".pdf"):
            reader = PdfReader(os.path.join(folder, filename))
            # BUGFIX: extract_text() may return None for pages without a
            # text layer; "text += None" would raise TypeError.
            text = "".join(page.extract_text() or "" for page in reader.pages)
            documents.append(text)
    return documents
46
+
47
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping fixed-size windows.

    Parameters
    ----------
    text : str
        Text to split.
    chunk_size : int
        Maximum length of each chunk.
    overlap : int
        Characters shared between consecutive chunks; must be smaller than
        ``chunk_size`` so the window always advances.

    Returns
    -------
    list[str]
        Chunks in order; an empty list for empty input.

    Raises
    ------
    ValueError
        If ``overlap >= chunk_size`` (the original silently produced no
        chunks, or raised from ``range``, for such arguments).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    step = chunk_size - overlap
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]
52
+
53
# Build / refresh the vector store from every PDF under DOCUMENTS_FOLDER.
if st.button("Process Documents"):
    st.info("Processing documents...")
    all_text = load_documents(DOCUMENTS_FOLDER)
    chunks = []
    for text in all_text:
        chunks.extend(chunk_text(text))

    if chunks:
        embeddings = embedder.encode(chunks, show_progress_bar=True)
        index.add(np.array(embeddings))
        faiss.write_index(index, vectorstore_path)
        # BUGFIX: persist the chunk texts in session_state. Streamlit
        # re-runs the whole script on every widget interaction, so a plain
        # module-level `chunks` is gone by the time a query arrives
        # (NameError in the retrieval step below).
        st.session_state["chunks"] = chunks
        st.success("Documents processed and vectorstore updated!")
    else:
        st.warning("No PDF text found in the documents folder.")

# User interface
st.title("RAG Application with Streamlit")

user_query = st.text_input("Enter your query:")

if user_query:
    # Chunk texts survive reruns only via session_state (set above).
    chunks = st.session_state.get("chunks", [])
    if not chunks or index.ntotal == 0:
        st.warning("Please click 'Process Documents' before querying.")
    else:
        query_embedding = embedder.encode([user_query])
        # Never ask FAISS for more neighbours than the index holds —
        # missing slots come back as index -1.
        k = min(5, index.ntotal)
        distances, indices = index.search(np.array(query_embedding), k=k)
        retrieved_chunks = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

        st.subheader("Retrieved Chunks")
        for chunk in retrieved_chunks:
            st.write(chunk)

        # Separate context from question so the model can tell them apart
        # (the original glued the last chunk directly onto the query).
        combined_input = (
            "Answer the question using the context below.\n\n"
            "Context:\n" + "\n\n".join(retrieved_chunks)
            + f"\n\nQuestion: {user_query}"
        )
        # BUGFIX: the groq SDK has no Client.generate(); it exposes
        # chat.completions.create and returns choice objects, not a dict.
        response = groq_client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[{"role": "user", "content": combined_input}],
            max_tokens=200,
        )

        st.subheader("Generated Response")
        st.write(response.choices[0].message.content)