Engineer786 commited on
Commit
3342df5
·
verified ·
1 Parent(s): d8b1740

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +108 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from io import BytesIO
4
+ from PyPDF2 import PdfReader
5
+ from tempfile import NamedTemporaryFile
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain_community.embeddings import HuggingFaceEmbeddings
8
+ from langchain_community.vectorstores import FAISS
9
+ from groq import Groq
10
+ import streamlit as st
11
+
12
# --- Global configuration -------------------------------------------------

# Groq client used by query_vector_db(). GROQ_API_KEY must be set in the
# environment: os.getenv returns None when it is missing, and the Groq
# client will then fail at request time, not here.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Publicly shared Google Drive PDF links that seed the knowledge base.
# Each entry must be a ".../file/d/<id>/view" style sharing URL.
drive_links = [
    "https://drive.google.com/file/d/1JPf0XvDhn8QoDOlZDrxCOpu4WzKFESNz/view?usp=sharing"
]
24
+
25
+ # Function to download PDF from Google Drive
26
def download_pdf_from_drive(drive_link, timeout=30):
    """Download a publicly shared Google Drive PDF and return it as a BytesIO.

    Parameters
    ----------
    drive_link : str
        A "https://drive.google.com/file/d/<id>/view" style sharing URL.
    timeout : int, optional
        Seconds to wait for the HTTP response (default 30). The original
        call had no timeout, so an unresponsive server could hang the app
        indefinitely.

    Raises
    ------
    Exception
        If the link does not look like a Drive file URL, or the download
        does not return HTTP 200.
    """
    try:
        file_id = drive_link.split('/d/')[1].split('/')[0]
    except IndexError:
        # Malformed link previously surfaced as an opaque IndexError.
        raise Exception(f"Not a recognizable Google Drive file link: {drive_link}")
    download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
    # NOTE(review): very large files trigger Drive's virus-scan confirmation
    # page instead of the raw bytes — presumably these PDFs are small enough;
    # verify if downloads start returning HTML.
    response = requests.get(download_url, timeout=timeout)
    if response.status_code == 200:
        return BytesIO(response.content)
    raise Exception("Failed to download the PDF file from Google Drive.")
34
+
35
+ # Function to extract text from a PDF
36
def extract_text_from_pdf(pdf_stream):
    """Return the concatenated text of every page in a PDF.

    Parameters
    ----------
    pdf_stream : file-like
        A binary stream (e.g. BytesIO) containing the PDF bytes.

    Returns
    -------
    str
        All extractable page text joined in page order.

    Bug fix: PyPDF2's ``page.extract_text()`` returns ``None`` for pages
    with no extractable text (e.g. scanned images), so the original
    ``text += page.extract_text()`` raised TypeError on such pages. An
    empty string is substituted instead; ``join`` also avoids quadratic
    string concatenation.
    """
    pdf_reader = PdfReader(pdf_stream)
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
42
+
43
+ # Function to split text into chunks
44
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into overlapping pieces with LangChain's recursive splitter.

    Parameters
    ----------
    text : str
        The full document text.
    chunk_size : int, optional
        Target size of each chunk (default 500).
    chunk_overlap : int, optional
        Characters shared between consecutive chunks (default 50).

    Returns
    -------
    list[str]
        The chunked text segments.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                              chunk_overlap=chunk_overlap)
    return splitter.split_text(text)
49
+
50
+ # Function to create embeddings and store them in FAISS
51
def create_embeddings_and_store(chunks):
    """Embed text chunks with MiniLM and index them in an in-memory FAISS store.

    Parameters
    ----------
    chunks : list[str]
        Text segments to embed.

    Returns
    -------
    FAISS
        A LangChain FAISS vector store built from the chunks.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_texts(chunks, embedding=embedder)
55
+
56
+ # Function to query the vector database and interact with Groq
57
def query_vector_db(query, vector_db):
    """Answer *query* with Groq's llama3, grounded on the 3 most similar chunks.

    Parameters
    ----------
    query : str
        The user's question.
    vector_db : FAISS
        Vector store produced by create_embeddings_and_store().

    Returns
    -------
    str
        The model's reply text.
    """
    # Pull the top-3 nearest chunks to serve as grounding context.
    matches = vector_db.similarity_search(query, k=3)
    context = "\n".join(doc.page_content for doc in matches)

    # Ship context + question to the Groq chat endpoint (module-level client).
    messages = [
        {"role": "system", "content": f"Use the following context:\n{context}"},
        {"role": "user", "content": query},
    ]
    completion = client.chat.completions.create(
        messages=messages,
        model="llama3-8b-8192",
    )
    return completion.choices[0].message.content
71
+
72
+ # Streamlit app
73
# --- Streamlit app --------------------------------------------------------
# Streamlit re-executes this entire script on EVERY widget interaction, so
# without caching the PDFs were re-downloaded and re-embedded for each user
# query. st.cache_resource builds the vector store once per process.

st.title("RAG-Based ChatBot (Already having Document)")

st.write("Processing the Data links...")


@st.cache_resource(show_spinner=False)
def _build_vector_db(links):
    """Download, extract, chunk and embed every PDF link.

    Returns the FAISS store, or None when no text could be processed.
    *links* must be hashable (a tuple) so it can serve as the cache key.
    """
    all_chunks = []
    for link in links:
        try:
            pdf_stream = download_pdf_from_drive(link)
            text = extract_text_from_pdf(pdf_stream)
            all_chunks.extend(chunk_text(text))
        except Exception as e:
            # Best-effort: report the failing link and keep going.
            st.write(f"Error processing link {link}: {e}")
    if not all_chunks:
        return None
    return create_embeddings_and_store(all_chunks)


vector_db = _build_vector_db(tuple(drive_links))

if vector_db is not None:
    st.write("Data is Ready Successfully!")

    # User query input
    user_query = st.text_input("Enter your query:")
    if user_query:
        response = query_vector_db(user_query, vector_db)
        st.write("Response from LLM:")
        st.write(response)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ requests
2
+ PyPDF2
3
+ streamlit
4
+ groq
5
+ langchain
6
+ langchain-community
7
+ faiss-gpu
8
+ sentence-transformers