Engineer786 committed on
Commit
fc136bf
·
verified ·
1 Parent(s): 1bbad9b

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +96 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ import os
4
+ import json
5
+ import faiss
6
+ import numpy as np
7
+ import PyPDF2
8
+ import requests
9
+ import streamlit as st
10
+ from groq import Groq
11
+
12
+ # Constants
13
+ PDF_URL = "https://drive.google.com/uc?export=download&id=1YWX-RYxgtcKO1QETnz1N3rboZUhRZwcH"
14
+ VECTOR_DIM = 768
15
+ CHUNK_SIZE = 512
16
+
17
# Function to download and extract text from the PDF
def extract_text_from_pdf(url):
    """Download the PDF at *url* and return the text of all its pages.

    The file is also saved locally as "document.pdf".

    Raises:
        requests.HTTPError: if the download returns an error status.
    """
    response = requests.get(url)
    # Fail fast on a bad download instead of handing an HTML error page to PyPDF2.
    response.raise_for_status()
    with open("document.pdf", "wb") as f:
        f.write(response.content)

    with open("document.pdf", "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # extract_text() may return None for image-only pages; substitute ""
        # so the join does not raise TypeError.
        text = "\n".join((page.extract_text() or "") for page in reader.pages)
    return text
27
+
28
# Function to split text into chunks
def create_chunks(text, chunk_size):
    """Split *text* on whitespace into chunks of at most *chunk_size* words."""
    words = text.split()
    pieces = []
    for start in range(0, len(words), chunk_size):
        pieces.append(" ".join(words[start:start + chunk_size]))
    return pieces
33
+
34
# Function to create FAISS vector store
def create_faiss_index(chunks, vector_dim):
    """Build a flat L2 FAISS index over *chunks* and return (index, embeddings).

    NOTE(review): the embeddings are random placeholders, not real text
    embeddings of the chunks — replace with a real embedding model.
    """
    use_gpu = faiss.get_num_gpus() > 0
    if use_gpu:
        st.write("Using GPU for FAISS indexing.")
        gpu_resources = faiss.StandardGpuResources()  # Initialize GPU resources
        cpu_index = faiss.IndexFlatL2(vector_dim)
        index = faiss.index_cpu_to_gpu(gpu_resources, 0, cpu_index)
    else:
        st.write("Using CPU for FAISS indexing.")
        index = faiss.IndexFlatL2(vector_dim)

    # Placeholder vectors, one per chunk. Replace with real embeddings.
    embeddings = np.random.rand(len(chunks), vector_dim).astype('float32')
    index.add(embeddings)
    return index, embeddings
49
+
50
# Initialize Groq API client
def get_groq_client():
    """Return a Groq client authenticated via the GROQ_API_KEY env var.

    Raises:
        RuntimeError: if GROQ_API_KEY is not set, so the misconfiguration
        surfaces here instead of as an obscure failure on the first API call.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError("GROQ_API_KEY environment variable is not set.")
    return Groq(api_key=api_key)
53
+
54
# Query Groq model
def query_model(client, question):
    """Send *question* to the Groq chat model and return the reply text."""
    completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[{"role": "user", "content": question}],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
61
+
62
# Streamlit app
def main():
    """Render the four-step RAG workflow: extract, chunk, index, query."""
    st.title("RAG-Based Application")

    state = st.session_state

    # Step 1: Extract text from the document
    st.header("Step 1: Extract Text")
    if st.button("Extract Text from PDF"):
        state["text"] = extract_text_from_pdf(PDF_URL)
        st.success("Text extracted successfully!")

    # Step 2: Chunk the text (only once text is available)
    st.header("Step 2: Create Chunks")
    if "text" in state and st.button("Create Chunks"):
        chunks = create_chunks(state["text"], CHUNK_SIZE)
        state["chunks"] = chunks
        st.success(f"Created {len(chunks)} chunks.")

    # Step 3: Build the FAISS index (only once chunks exist)
    st.header("Step 3: Create Vector Database")
    if "chunks" in state and st.button("Create Vector Database"):
        index, embeddings = create_faiss_index(state["chunks"], VECTOR_DIM)
        state["index"] = index
        st.success("FAISS vector database created.")

    # Step 4: Ask a question (only once the index exists)
    st.header("Step 4: Query the Model")
    question = st.text_input("Ask a question about the document:")
    if question and "index" in state:
        client = get_groq_client()
        st.write("Answer:", query_model(client, question))

if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ requests
3
+ PyPDF2
4
+ numpy==1.23.5
5
+ faiss-gpu
6
+ groq