Talha812 committed on
Commit
a580ec9
·
verified ·
1 Parent(s): 55c29b3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -0
app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ import os
4
+ import json
5
+ import faiss
6
+ import numpy as np
7
+ import PyPDF2
8
+ import requests
9
+ import streamlit as st
10
+ from groq import Groq
11
+
12
+ # Constants
13
# Direct-download link (Google Drive) for the PDF the app works on.
PDF_URL: str = "https://drive.google.com/uc?export=download&id=1YWX-RYxgtcKO1QETnz1N3rboZUhRZwcH"
# Dimensionality of the vectors stored in the FAISS index.
VECTOR_DIM: int = 768
# Maximum number of whitespace-separated words per text chunk.
CHUNK_SIZE: int = 512
16
+
17
+ # Function to download and extract text from the PDF
18
def extract_text_from_pdf(url):
    """Download the PDF at *url* and return the text of all its pages.

    The response body is written to ``document.pdf`` (relative to the current
    working directory) and then parsed with PyPDF2. The text of consecutive
    pages is joined with a newline.

    Args:
        url: HTTP(S) address the PDF is fetched from.

    Returns:
        The concatenated page text as a single string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
    """
    # Without a timeout a stalled server would block the app indefinitely.
    response = requests.get(url, timeout=30)
    # Fail here rather than saving an error page and handing it to the
    # PDF parser.
    response.raise_for_status()

    with open("document.pdf", "wb") as pdf_file:
        pdf_file.write(response.content)

    with open("document.pdf", "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Substitute "" for a falsy extraction result so str.join never
        # receives a non-string (e.g. None for a page without text).
        return "\n".join(page.extract_text() or "" for page in reader.pages)
27
+
28
+ # Function to split text into chunks
29
def create_chunks(text, chunk_size):
    """Split *text* into chunks of at most *chunk_size* words.

    Words are the whitespace-separated tokens produced by ``str.split()``;
    inside a chunk they are re-joined with single spaces, so the original
    whitespace (including newlines) is not preserved.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings in document order. Empty or whitespace-only
        text yields an empty list.

    Raises:
        ValueError: If *chunk_size* is zero or negative.
    """
    # range() rejects a zero step with an unrelated-looking message and a
    # negative step silently produces no chunks, so validate explicitly.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
33
+
34
+ # Function to create FAISS vector store
35
def create_faiss_index(chunks, vector_dim):
    """Build a flat L2 FAISS index containing one vector per chunk.

    NOTE: the vectors are random placeholders. Only the number of chunks is
    used; the chunk text does not influence the embeddings, so nearest-
    neighbour results from this index carry no meaning until a real
    embedding model is plugged in.

    Args:
        chunks: Sequence of text chunks; only its length is read.
        vector_dim: Dimensionality of each vector and of the index.

    Returns:
        A ``(index, embeddings)`` tuple: the populated FAISS index and the
        float32 array of shape ``(len(chunks), vector_dim)`` added to it.
    """
    store = faiss.IndexFlatL2(vector_dim)
    chunk_count = len(chunks)
    # Replace with real embeddings
    vectors = np.random.rand(chunk_count, vector_dim).astype("float32")
    store.add(vectors)
    return store, vectors
40
+
41
+ # Initialize Groq API client
42
def get_groq_client():
    """Return a Groq client authenticated via the ``GROQ_API_KEY`` env var.

    The key is read from the environment instead of being embedded in the
    source, so it never ends up in version control. Any key that was
    previously committed should be treated as compromised and rotated.

    Returns:
        A ``Groq`` client constructed with the configured API key.

    Raises:
        RuntimeError: If ``GROQ_API_KEY`` is unset or empty.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GROQ_API_KEY is not set; define it in the environment "
            "(or as a deployment secret) before querying the model."
        )
    return Groq(api_key=api_key)
44
+
45
+ # Query Groq model
46
+ def query_model(client, question):
47
+ chat_completion = client.chat.completions.create(
48
+ messages=[{"role": "user", "content": question}],
49
+ model="llama-3.3-70b-versatile",
50
+ )
51
+ return chat_completion.choices[0].message.content
52
+
53
+ # Streamlit app
54
def main():
    """Render the four-step Streamlit page: extract, chunk, index, query.

    Each step's result is kept in ``st.session_state`` under the keys
    ``"text"``, ``"chunks"`` and ``"index"``. The buttons for steps 2 and 3
    are only rendered once the previous step's result is present, and the
    model is only queried once an index exists.
    """
    st.title("RAG-Based Application")

    # Step 1: Extract text from the document
    st.header("Step 1: Extract Text")
    if st.button("Extract Text from PDF"):
        try:
            text = extract_text_from_pdf(PDF_URL)
        except requests.RequestException as err:
            # Report a failed download in the page instead of crashing it.
            st.error(f"Could not download the PDF: {err}")
        else:
            st.session_state["text"] = text
            st.success("Text extracted successfully!")

    # Step 2: Chunk the text
    st.header("Step 2: Create Chunks")
    if "text" in st.session_state and st.button("Create Chunks"):
        chunks = create_chunks(st.session_state["text"], CHUNK_SIZE)
        st.session_state["chunks"] = chunks
        st.success(f"Created {len(chunks)} chunks.")

    # Step 3: Create FAISS index
    st.header("Step 3: Create Vector Database")
    if "chunks" in st.session_state and st.button("Create Vector Database"):
        # Only the index is stored; the raw embeddings are not used again.
        index, _ = create_faiss_index(st.session_state["chunks"], VECTOR_DIM)
        st.session_state["index"] = index
        st.success("FAISS vector database created.")

    # Step 4: Ask a question
    st.header("Step 4: Query the Model")
    question = st.text_input("Ask a question about the document:")
    # NOTE(review): the index only gates this step; the question is sent to
    # the model as-is, without any context retrieved from the index.
    if question and "index" in st.session_state:
        client = get_groq_client()
        answer = query_model(client, question)
        st.write("Answer:", answer)
85
+
86
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()