NHZ committed on
Commit
d285555
·
verified ·
1 Parent(s): 2742de0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -89
app.py CHANGED
@@ -1,113 +1,102 @@
1
  import os
 
2
  import requests
3
- import PyPDF2
4
- import faiss
5
- import numpy as np
6
  import streamlit as st
7
- from transformers import AutoTokenizer, AutoModel
 
8
  from groq import Groq
9
 
10
- # Download file from Google Drive link
11
- def download_file_from_drive(url):
12
- file_id = url.split("/d/")[1].split("/")[0]
13
- download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
 
 
 
14
  response = requests.get(download_url)
15
- pdf_path = "document.pdf"
16
- with open(pdf_path, "wb") as f:
17
  f.write(response.content)
18
- return pdf_path
19
 
20
  # Extract text from PDF
21
def extract_text_from_pdf(pdf_path):
    """Extract and space-join the text of every page in the PDF at pdf_path.

    PyPDF2's extract_text() may return None for image-only pages; substitute
    "" so the join doesn't raise TypeError.
    """
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        text = " ".join(page.extract_text() or "" for page in reader.pages)
    return text
26
 
27
- # Chunk text
28
def chunk_text(text, chunk_size=500):
    """Split *text* into chunks of at most chunk_size whitespace-separated words.

    Returns a list of strings; empty input yields an empty list.
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[start:start + chunk_size]))
    return chunks
32
 
33
- # Generate embeddings
34
def generate_embeddings(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed each text chunk by mean-pooling the model's last hidden state.

    Args:
        chunks: list of strings to embed.
        model_name: Hugging Face model id for the tokenizer/encoder pair.

    Returns:
        A 2-D numpy array with one row per chunk.

    The tokenizer/model pair is cached on the function object: the original
    implementation reloaded both from disk on every call, which main() does
    once per user query.
    """
    cache = getattr(generate_embeddings, "_cache", None)
    if cache is None:
        cache = generate_embeddings._cache = {}
    if model_name not in cache:
        cache[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    tokenizer, model = cache[model_name]
    embeddings = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        # Mean over the token (sequence) dimension -> one vector per chunk.
        embeddings.append(outputs.last_hidden_state.mean(dim=1).detach().numpy())
    return np.vstack(embeddings)
43
-
44
- # Store embeddings in FAISS
45
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over the given embedding matrix."""
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index
50
 
51
- # Groq API Integration
52
def query_groq_api(query, api_key):
    """Send *query* as a single user message to Groq and return the reply text."""
    client = Groq(api_key=api_key)
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": query}],
        model="llama-3.3-70b-versatile",
    )
    return completion.choices[0].message.content
64
 
65
- # Streamlit App
66
def main():
    """Streamlit UI: index a Google-Drive-hosted PDF, then answer questions over it.

    Sidebar collects the Groq API key and the Drive link; "Process Document"
    builds the FAISS index and stores it in st.session_state so later reruns
    (Streamlit re-executes main() on every interaction) can query it.
    """
    st.title("RAG-based Application")
    st.sidebar.title("Settings")

    groq_api_key = st.sidebar.text_input("Enter your Groq API Key", type="password")
    google_drive_url = st.sidebar.text_input("Enter Google Drive File Link")

    if st.sidebar.button("Process Document"):
        if not google_drive_url:
            # Guard: an empty link would crash download_file_from_drive with
            # IndexError on split("/d/").
            st.error("Please enter a Google Drive file link first.")
        else:
            st.info("Downloading document...")
            pdf_path = download_file_from_drive(google_drive_url)
            st.success("Document downloaded successfully!")

            st.info("Extracting text...")
            text = extract_text_from_pdf(pdf_path)
            st.success("Text extracted successfully!")

            st.info("Chunking text...")
            chunks = chunk_text(text)
            st.success(f"Document chunked into {len(chunks)} chunks.")

            st.info("Generating embeddings...")
            embeddings = generate_embeddings(chunks)
            st.success("Embeddings generated successfully!")

            st.info("Creating FAISS index...")
            index = create_faiss_index(embeddings)
            st.success("FAISS index created successfully!")

            # Persist across Streamlit reruns so queries don't reprocess the PDF.
            st.session_state.index = index
            st.session_state.chunks = chunks

    if "index" in st.session_state:
        query = st.text_input("Ask a question:")
        if st.button("Search"):
            st.info("Querying FAISS index...")
            query_embeddings = generate_embeddings([query])
            distances, indices = st.session_state.index.search(query_embeddings, k=5)
            relevant_chunks = [st.session_state.chunks[i] for i in indices[0]]
            st.success("Relevant chunks retrieved!")

            st.info("Generating answer via Groq API...")
            context = " ".join(relevant_chunks)
            answer = query_groq_api(context + "\n" + query, api_key=groq_api_key)
            st.success("Answer generated!")
            st.write(answer)
111
 
112
  if __name__ == "__main__":
113
  main()
 
 
1
  import os
2
+ import re
3
  import requests
4
+ import pdfplumber
 
 
5
  import streamlit as st
6
+ import faiss
7
+ from sentence_transformers import SentenceTransformer
8
  from groq import Groq
9
 
10
+ # Built-in Google Drive document link
11
+ DOCUMENT_URL = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
12
+
13
+ # Function to download document from the Google Drive link
14
def download_document(file_url):
    """Download a Google Drive "share" link to ./document.pdf and return the path.

    Args:
        file_url: A Drive link of the form https://drive.google.com/file/d/<id>/view...

    Returns:
        The local path "document.pdf".

    Raises:
        IndexError: if the URL does not contain a "/d/<id>/" segment.
        requests.HTTPError: if the download request fails.
    """
    file_id = file_url.split("/d/")[1].split("/")[0]
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
    response = requests.get(download_url, timeout=60)
    # Fail loudly instead of silently saving an HTML error page as a "PDF".
    response.raise_for_status()
    output = "document.pdf"
    with open(output, "wb") as f:
        f.write(response.content)
    return output
22
 
23
  # Extract text from PDF
24
def extract_text_from_pdf(file_path):
    """Concatenate the extractable text of every page in the PDF at file_path.

    pdfplumber's extract_text() returns None for pages with no extractable
    text; without the `or ""` guard the original `text += ...` raised
    TypeError on such pages. Parts are joined once instead of built with
    repeated string concatenation.
    """
    parts = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            parts.append(page.extract_text() or "")
    return "".join(parts)
30
 
31
+ # Chunk the text
32
def chunk_text(text, chunk_size=500):
    """Split *text* into sentence-aligned chunks of roughly chunk_size characters.

    Sentences are found with a heuristic regex (split after "." or "?" not
    preceded by an abbreviation-like pattern). A sentence longer than
    chunk_size becomes its own oversized chunk.

    Fixes vs. the original: (a) when the very first sentence exceeds
    chunk_size, an empty-string chunk was appended; (b) empty or
    whitespace-only input returned [""] instead of [].
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    chunks, current_chunk = [], ""
    for sentence in sentences:
        if not sentence:
            continue
        if len(current_chunk) + len(sentence) < chunk_size:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
44
 
45
+ # Vectorize and store in FAISS
46
def create_faiss_index(chunks, model):
    """Encode *chunks* with *model* and index the vectors in a flat L2 FAISS index.

    Returns the (index, embeddings) pair so callers can reuse the vectors.
    """
    vectors = model.encode(chunks)
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index, vectors
52
 
53
+ # Query FAISS index
54
def query_faiss(query, index, chunks, model, k=5):
    """Return the k chunks whose embeddings are nearest (L2) to the query's."""
    embedded_query = model.encode([query])
    _, nearest = index.search(embedded_query, k)
    return [chunks[idx] for idx in nearest[0]]
 
 
 
 
 
 
 
 
58
 
59
+ # Streamlit application
60
def main():
    """Streamlit entry point: index the pre-configured PDF once, then answer queries.

    Fixes vs. the original: (1) the document was downloaded, parsed, embedded,
    and indexed on EVERY Streamlit rerun (every interaction), because nothing
    was cached — heavy work now runs once per session via st.session_state;
    (2) the Groq request sent only the bare query, discarding the retrieved
    chunks — the prompt now includes them, so answers are actually grounded
    in the document (RAG).
    """
    st.title("RAG-based Application")
    st.write("Interacting with a knowledge base derived from the uploaded document.")

    # Streamlit re-executes main() on every interaction; cache the expensive
    # pipeline (download -> parse -> chunk -> embed -> index) in session_state.
    if "index" not in st.session_state:
        st.write("Processing the pre-configured document...")
        document_path = download_document(DOCUMENT_URL)
        text = extract_text_from_pdf(document_path)
        chunks = chunk_text(text)

        st.write("Loading model and creating FAISS index...")
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        index, embeddings = create_faiss_index(chunks, embedding_model)

        st.session_state.index = index
        st.session_state.chunks = chunks
        st.session_state.model = embedding_model
        st.success("Document processed and indexed!")

    # Query the database
    query = st.text_input("Enter your query")
    if query:
        results = query_faiss(query, st.session_state.index,
                              st.session_state.chunks, st.session_state.model)
        st.write("Top relevant chunks:")
        for i, result in enumerate(results):
            st.write(f"{i+1}. {result}")

        # Groq API interaction
        groq_api_key = os.environ.get("GROQ_API_KEY")  # Securely fetched from Hugging Face Secrets
        if groq_api_key:
            client = Groq(api_key=groq_api_key)
            st.write("Fetching response from Groq API...")
            # Ground the model's answer in the retrieved chunks, not just the query.
            prompt = "Context:\n" + "\n".join(results) + f"\n\nQuestion: {query}"
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama-3.3-70b-versatile"
            )
            st.write("Response:")
            st.write(chat_completion.choices[0].message.content)
        else:
            st.error("Groq API key not configured in Hugging Face Secrets.")
 
 
 
 
 
 
99
 
100
  if __name__ == "__main__":
101
  main()
102
+