SumbalFatima1122 committed
Commit 37f721f · verified · 1 Parent(s): 7b39b9b

Update app.py

Files changed (1)
  1. app.py +81 -93
app.py CHANGED
@@ -1,103 +1,91 @@
-# Install necessary libraries
+# Required module installations (uncomment and run in your environment if needed)
+# !pip install requests PyPDF2 langchain faiss-cpu streamlit groq sentence-transformers
 
-
-# Set up API key
+import requests
+import io
+import PyPDF2
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings  # Open-source embedding model
+from langchain.vectorstores import FAISS
+from groq import Groq
 import os
-os.environ['GROQ_API_KEY'] = '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'
+import streamlit as st
 
-# Download and process PDFs from public Google Drive links
-import requests
-import pdfplumber
+# Set up Groq API
+os.environ["GROQ_API_KEY"] = "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"  # Replace with your Groq API key
+client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
 
-def download_from_drive(link):
+# Function to download PDF from Google Drive link
+def download_pdf_from_link(link):
     file_id = link.split('/d/')[1].split('/')[0]
-    download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
+    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
     response = requests.get(download_url)
     if response.status_code == 200:
-        file_path = f"{file_id}.pdf"
-        with open(file_path, 'wb') as f:
-            f.write(response.content)
-        print(f"PDF downloaded successfully: {file_path}")
-        return file_path
+        return response.content
     else:
-        raise Exception("Failed to download file. Please check the link.")
-
-def extract_text_from_pdf(file_path):
-    try:
-        with pdfplumber.open(file_path) as pdf:
-            text = ''.join(page.extract_text() for page in pdf.pages)
-        print(f"Extracted text length: {len(text)}")
-        return text
-    except Exception as e:
-        print(f"Error extracting text: {e}")
-        return ""
-
-# Preprocess documents into chunks
-from langchain.text_splitter import CharacterTextSplitter
-
-def preprocess_document(content):
-    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-    return text_splitter.split_text(content)
-
-# Generate embeddings
-from sentence_transformers import SentenceTransformer
-
-def generate_embeddings(text_chunks):
-    model = SentenceTransformer('all-MiniLM-L6-v2')
-    return [model.encode(chunk) for chunk in text_chunks]
-
-# Store embeddings in FAISS
-import faiss
-import numpy as np
-
-def create_faiss_index(embeddings):
-    dimension = len(embeddings[0])
-    index = faiss.IndexFlatL2(dimension)
-    index.add(np.array(embeddings))
-    faiss.write_index(index, "faiss_index.index")
-    print("Embeddings stored in FAISS.")
-
-# Query the Groq model
-from groq import Groq
-
-client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
-
-def query_model(prompt):
-    response = client.chat.completions.create(
-        messages=[{"role": "user", "content": prompt}],
-        model="llama3-8b-8192"
+        raise Exception("Failed to download file. Check the link.")
+
+# Function to extract text from PDF
+def read_pdf(pdf_content):
+    file_io = io.BytesIO(pdf_content)
+    pdf_reader = PyPDF2.PdfReader(file_io)
+    text = ""
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+    return text
+
+# Function to create chunks of text
+def create_chunks(documents):
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    chunks = []
+    for doc in documents:
+        chunks.extend(text_splitter.split_text(doc))
+    return chunks
+
+# Function to query the Groq API with vectorstore
+def query_with_groq(query, vectorstore):
+    docs = vectorstore.similarity_search(query, k=3)
+    context = " ".join([doc.page_content for doc in docs])
+
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": f"{context}\n\n{query}"}
+        ],
+        model="llama3-8b-8192",
     )
-    return response.choices[0].message.content
-
-# Streamlit Frontend
-import streamlit as st
-
-st.title("RAG Application with Google Drive Documents")
-
-doc_links = [
-    "https://drive.google.com/file/d/1zoo4-GNIGPtbT_Yb4nIZw-qYf8Wj57nP/view?usp=sharing"
-]
-
-query = st.text_input("Enter your query:")
-
-if query:
-    all_chunks = []
-    for link in doc_links:
-        try:
-            file_path = download_from_drive(link)
-            extracted_text = extract_text_from_pdf(file_path)
-            if extracted_text.strip():
-                text_chunks = preprocess_document(extracted_text)
-                embeddings = generate_embeddings(text_chunks)
-                create_faiss_index(embeddings)
-                all_chunks.extend(text_chunks)
-            else:
-                st.error("Failed to extract text from the document.")
-        except Exception as e:
-            st.error(f"Error processing document: {e}")
-
-    if all_chunks:
-        result = query_model(query)
-        st.write(result)
+    return chat_completion.choices[0].message.content
+
+# Main function to initialize the app
+def main():
+    st.title("RAG Application with Google Drive Links")
+
+    # Input links (replace these with your document links)
+    links = [
+        "https://drive.google.com/file/d/1zoo4-GNIGPtbT_Yb4nIZw-qYf8Wj57nP/view?usp=sharing"
+        # Add more links here if needed
+    ]
+
+    # Load or process documents
+    if "vectorstore" not in st.session_state:
+        documents = [read_pdf(download_pdf_from_link(link)) for link in links]
+        chunks = create_chunks(documents)
+
+        # Generate embeddings and store in FAISS
+        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+        vectorstore = FAISS.from_texts(chunks, embeddings)
+        vectorstore.save_local("faiss_index")
+        st.session_state.vectorstore = vectorstore
     else:
-        st.error("No valid data to process. Please check your document links.")
+        vectorstore = st.session_state.vectorstore
+
+    # Query input from user
+    query = st.text_input("Enter your query:")
+    if query:
+        response = query_with_groq(query, vectorstore)
+        st.write("Response:")
+        st.write(response)
+
+# Run the app
+if __name__ == "__main__":
+    main()
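
For reference, a minimal sketch (not part of the commit) of how a later run could reload the index that vectorstore.save_local("faiss_index") persists, instead of re-downloading and re-embedding the PDFs after every process restart. It assumes the same embedding model as the app and a LangChain version whose FAISS.load_local accepts the allow_dangerous_deserialization flag (newer releases require it because the saved index metadata is pickled):

import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Rebuild the embedding wrapper with the same model used to create the index.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

if os.path.isdir("faiss_index"):
    # Load the persisted index rather than re-embedding every document.
    vectorstore = FAISS.load_local(
        "faiss_index",
        embeddings,
        allow_dangerous_deserialization=True,  # assumption: required/available in the installed LangChain version
    )
    # The reloaded store supports the same retrieval call the app uses.
    docs = vectorstore.similarity_search("example query", k=3)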