makhdoomnaeem committed on
Commit
6d7222a
·
verified ·
1 Parent(s): a15bbb6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -128
app.py CHANGED
@@ -1,141 +1,98 @@
1
  import os
2
- import pickle
3
- import re
4
  import streamlit as st
5
- from googleapiclient.discovery import build
6
- from google_auth_oauthlib.flow import InstalledAppFlow
7
- from sentence_transformers import SentenceTransformer
8
- import faiss
9
- from groq import Groq
10
-
11
- # Constants
12
- SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
13
 
14
  # Initialize Groq Client
15
  GROQ_API_KEY = "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" # Replace with your Groq API key
16
  os.environ["GROQ_API_KEY"] = GROQ_API_KEY
17
  client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
18
 
19
- # Hardcoded Google Drive share link
20
- SHARE_LINK = "https://drive.google.com/drive/folders/1gVdnV1za1thrVnH4LALbDzPtcvKr0z4u?usp=drive_link"
21
-
22
- # Function to extract folder ID from share link
23
- def extract_folder_id(share_link):
24
- match = re.search(r"(?<=folders/)[^/?]+", share_link)
25
- if match:
26
- return match.group(0)
27
- else:
28
- st.error("Invalid Google Drive folder share link.")
29
- return None
30
-
31
- # Function to authenticate Google Drive
32
- def authenticate_drive():
33
- creds = None
34
- if os.path.exists("token.pickle"):
35
- with open("token.pickle", "rb") as token:
36
- creds = pickle.load(token)
37
- if not creds or not creds.valid:
38
- if creds and creds.expired and creds.refresh_token:
39
- creds.refresh(Request())
40
- else:
41
- flow = InstalledAppFlow.from_client_secrets_file("client_secrets.json", SCOPES)
42
- creds = flow.run_local_server(port=0)
43
- with open("token.pickle", "wb") as token:
44
- pickle.dump(creds, token)
45
- return build("drive", "v3", credentials=creds)
46
-
47
- # Function to load documents from Google Drive
48
- def load_documents(service, folder_id):
49
- documents = []
50
- results = service.files().list(
51
- q=f"'{folder_id}' in parents and trashed=false",
52
- fields="files(id, name, mimeType)"
53
- ).execute()
54
- files = results.get("files", [])
55
- for file in files:
56
- if file["mimeType"] == "application/pdf":
57
- request = service.files().get_media(fileId=file["id"])
58
- file_content = request.execute().decode("utf-8") # Assuming plain text PDF for simplicity
59
- documents.append({"id": file["id"], "name": file["name"], "text": file_content})
60
- return documents
61
-
62
- # Function to build FAISS index
63
- def build_faiss_index(documents, embedder):
64
- document_texts = [doc["text"] for doc in documents]
65
- embeddings = embedder.encode(document_texts, convert_to_tensor=False)
66
- dimension = embeddings[0].shape[0]
67
- index = faiss.IndexFlatL2(dimension)
68
- index.add(embeddings)
69
- return index, document_texts
70
 
71
  # Function to query Groq API
72
- def query_groq(query, context):
73
- chat_completion = client.chat.completions.create(
74
- messages=[
75
- {
76
- "role": "user",
77
- "content": f"Answer the following question based on these documents: {context} \n\nQuestion: {query}",
78
- }
 
 
 
 
79
  ],
80
- model="llama-3.3-70b-versatile",
81
- stream=False,
82
- )
83
- return chat_completion.choices[0].message.content
84
-
85
- # Streamlit UI
86
- st.title("Document Querying with RAG and Groq")
87
- st.write("Processing documents from a predefined Google Drive folder and answering your queries.")
88
-
89
- # Google Drive authentication
90
- service = authenticate_drive()
91
-
92
- # Extract folder ID from share link
93
- folder_id = extract_folder_id(SHARE_LINK)
94
- documents = []
95
 
96
- if folder_id:
97
- st.write("Fetching documents from Google Drive...")
98
  try:
99
- documents = load_documents(service, folder_id)
100
- st.success(f"Loaded {len(documents)} documents!")
101
- except Exception as e:
102
- st.error(f"Error fetching documents: {e}")
103
-
104
- # Build FAISS Index
105
- embedder = SentenceTransformer("all-MiniLM-L6-v2")
106
- index, document_texts = None, None
107
-
108
- if documents:
109
- st.write("Building FAISS index...")
110
- try:
111
- index, document_texts = build_faiss_index(documents, embedder)
112
- st.success("Index built successfully!")
113
- except Exception as e:
114
- st.error(f"Error building index: {e}")
115
-
116
- # Query the Documents
117
- query = st.text_input("Enter your question:")
118
- if query and index:
119
- st.write("Searching for relevant documents...")
120
- try:
121
- # Retrieve top-k relevant documents
122
- query_embedding = embedder.encode(query, convert_to_tensor=False)
123
- top_k = 3
124
- distances, indices = index.search([query_embedding], top_k)
125
- relevant_docs = [document_texts[idx] for idx in indices[0]]
126
- context = " ".join(relevant_docs)[:100000] # Truncate context for API compatibility
127
-
128
- # Display relevant document names
129
- st.write("Top relevant documents:")
130
- for idx in indices[0]:
131
- st.write(f"- {documents[idx]['name']}")
132
-
133
- # Query Groq API
134
- st.write("Querying Groq AI for the answer...")
135
- try:
136
- answer = query_groq(query, context)
137
- st.success(f"Answer: {answer}")
138
- except Exception as e:
139
- st.error(f"Error querying Groq API: {e}")
140
- except Exception as e:
141
- st.error(f"Error during query: {e}")
 
1
  import os
 
 
2
  import streamlit as st
3
+ import requests
4
+ from PyPDF2 import PdfReader
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 
8
 
9
# Groq API configuration.
# NOTE(review): the key below is hardcoded in the repo — it should be rotated
# and supplied via a secret store / environment variable. We only fall back to
# it when the environment does not already provide GROQ_API_KEY.
# The old `client = Groq(...)` instantiation was removed: `from groq import
# Groq` is no longer imported (it raised NameError at startup) and the client
# object was unused — all API calls now go through `requests` in
# query_groq_api().
GROQ_API_KEY = os.environ.get(
    "GROQ_API_KEY",
    "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",  # Replace with your Groq API key
)
os.environ["GROQ_API_KEY"] = GROQ_API_KEY
13
 
14
# Source document location: a fixed Google Drive share link (folder link),
# processed by download_pdf() below.
GOOGLE_DRIVE_LINK = "https://drive.google.com/drive/folders/1gVdnV1za1thrVnH4LALbDzPtcvKr0z4u?usp=drive_link"
16
+
17
# Function to download the PDF from Google Drive
def download_pdf():
    """Download the shared Google Drive file to ``document.pdf``.

    Returns:
        str: path of the downloaded file ("document.pdf").

    Raises:
        ValueError: if no file id can be extracted from GOOGLE_DRIVE_LINK.
        requests.HTTPError: if the download request fails.

    Bug fixed: the original did ``GOOGLE_DRIVE_LINK.split("/d/")[1]`` — but the
    hardcoded link is a ``/folders/`` share link with no ``/d/`` segment, so it
    raised IndexError on every call. We now accept the three common Drive link
    shapes (``/d/<id>``, ``/folders/<id>``, ``?id=<id>``).
    """
    import re  # local import: only needed here

    match = re.search(r"(?:/d/|/folders/|[?&]id=)([\w-]+)", GOOGLE_DRIVE_LINK)
    if match is None:
        raise ValueError(f"Could not extract a Drive file id from: {GOOGLE_DRIVE_LINK}")
    file_id = match.group(1)
    url = f"https://drive.google.com/uc?id={file_id}&export=download"
    # timeout so a stalled download cannot hang the app; surface HTTP errors
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open("document.pdf", "wb") as f:
        f.write(response.content)
    return "document.pdf"
25
+
26
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Args:
        pdf_file: path (or file-like object) accepted by PyPDF2.PdfReader.

    Returns:
        str: all page text joined in page order (may be "" for image-only PDFs).

    Bug fixed: ``page.extract_text()`` returns None for pages with no
    extractable text (e.g. scanned images); the original ``text += ...``
    raised TypeError in that case. We coerce None to "".
    """
    reader = PdfReader(pdf_file)
    return "".join(page.extract_text() or "" for page in reader.pages)
33
+
34
# Function to create FAISS vector database
def create_vector_db(text):
    """Chunk *text* and index the chunks in an in-memory FAISS store.

    Args:
        text: full document text to index.

    Returns:
        A langchain FAISS vector store over 500-char chunks (50-char overlap),
        embedded with the MiniLM sentence-transformer.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_text(text)

    # Use Hugging Face Embeddings
    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return FAISS.from_texts(chunks, embedder)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
# Function to query Groq API
def query_groq_api(query, context, model="llama-3.3-70b-versatile"):
    """Answer *query* using the Groq chat-completions HTTP API.

    Args:
        query: the user's question.
        context: retrieved document text prepended to the prompt.
        model: Groq model id (default "llama-3.3-70b-versatile").

    Returns:
        str: the model's answer, "No response." if the payload has no
        content, or an "Error: ..." string on any request failure
        (best-effort — this function never raises for network errors).
    """
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('GROQ_API_KEY')}",
    }
    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are an intelligent assistant."},
            {"role": "user", "content": f"Context: {context}\nQuestion: {query}"}
        ],
    }
    try:
        # Fix: added timeout — without it a stalled request hangs the
        # Streamlit worker indefinitely.
        response = requests.post(url, headers=headers, json=data, timeout=60)
        response.raise_for_status()  # Raise an error for bad responses
        result = response.json()
        # Defensive .get() chain: malformed payloads yield "No response."
        return result.get("choices", [{}])[0].get("message", {}).get("content", "No response.")
    except requests.exceptions.RequestException as e:
        return f"Error: {e}"
68
# ---------------------------------------------------------------------------
# Streamlit App: download the hardcoded PDF, index it, then answer questions.
# ---------------------------------------------------------------------------
st.title("PDF Q&A with Groq API")

# The FAISS index must survive Streamlit reruns, so it lives in session state.
if "vector_db" not in st.session_state:
    st.session_state.vector_db = None

# Step 1: fetch and index the document on demand.
if st.button("Process PDF"):
    st.info("Downloading and processing the PDF...")
    pdf_path = download_pdf()
    document_text = extract_text_from_pdf(pdf_path)
    st.success("PDF processed successfully!")

    st.info("Creating vector database...")
    st.session_state.vector_db = create_vector_db(document_text)
    st.success("Vector database created!")

# Step 2: once the index exists, take free-form questions about the document.
if st.session_state.vector_db:
    user_query = st.text_input("Ask a question about the document:")
    if st.button("Submit Query"):
        with st.spinner("Processing your query..."):
            # Retrieve the 3 most similar chunks as context for the LLM.
            matches = st.session_state.vector_db.similarity_search(user_query, k=3)
            context = " ".join([doc.page_content for doc in matches])

            # Send query with context to Groq API
            response = query_groq_api(user_query, context)
            st.write("**Answer:**", response)