NHZ committed on
Commit
018761e
·
verified ·
1 Parent(s): c35c139

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -91
app.py CHANGED
@@ -1,113 +1,93 @@
1
  import os
2
- import requests
3
  import streamlit as st
 
 
4
  import numpy as np
5
  import faiss
6
- from sentence_transformers import SentenceTransformer
7
  from groq import Groq
8
 
9
# Function to download document from a public Google Drive link
def download_file_from_public_link(url):
    """Download a publicly shared Google Drive file and return its text.

    Expects a share URL of the form .../file/d/<FILE_ID>/view...; raises
    Exception (existing callers catch Exception) when the URL has no
    /d/<id>/ segment or the download fails.
    """
    try:
        file_id = url.split("/d/")[1].split("/")[0]
    except IndexError:
        # Malformed link previously surfaced as a bare IndexError.
        raise Exception("Invalid Google Drive share link: missing /d/<id>/ segment.")
    download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
    # Timeout keeps the Streamlit app from hanging forever on a dead link.
    response = requests.get(download_url, timeout=30)
    if response.status_code == 200:
        return response.text
    else:
        raise Exception("Failed to download file from Google Drive.")
18
 
19
# Function to preprocess text
def preprocess_text(text, chunk_size=512):
    """Split *text* into chunks of at most roughly *chunk_size* words.

    Sentences (naively split on '.') are greedily packed into chunks; a
    sentence longer than chunk_size still becomes its own chunk. Bug fix:
    the original appended an empty-string chunk whenever the very first
    sentence alone exceeded chunk_size; empty chunks are no longer emitted.
    """
    sentences = text.split(".")
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence.split())
        # Only flush when there is something to flush (avoids "" chunks).
        if current_length + sentence_length > chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
39
 
40
# Function to create a FAISS index
def create_faiss_index(embeddings):
    """Build an exact (brute-force) L2 FAISS index over the embedding matrix."""
    vector_dim = embeddings.shape[1]
    l2_index = faiss.IndexFlatL2(vector_dim)
    l2_index.add(embeddings)
    return l2_index
46
 
47
# Function to query FAISS index
def query_faiss_index(index, query_embedding, top_k=5):
    """Return (indices, distances) of the top_k nearest chunks for one query."""
    distances, neighbor_ids = index.search(query_embedding, top_k)
    return neighbor_ids[0], distances[0]
 
 
 
51
 
52
# Streamlit App
def main():
    """Streamlit entry point: load a Drive document, index it, answer queries.

    Fixes over the original:
    - the SentenceTransformer model is cached in session_state instead of
      being re-instantiated on every rerun (it was built twice per session),
    - chunks are embedded with one batched encode() call,
    - the retrieved chunks are actually passed to the Groq model as context
      (the original sent only the bare query, defeating the RAG setup).
    """
    st.title("RAG-based Application")

    # Load Groq API Key from environment (set in Hugging Face secrets)
    groq_api_key = os.getenv("GROQ_API_KEY")
    if not groq_api_key:
        st.error("Groq API Key is missing. Ensure it is set as a secret in Hugging Face.")
        return

    # Predefined Google Drive link
    drive_link = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"

    # Load the embedding model once per session, not on every rerun.
    if "model" not in st.session_state:
        st.session_state["model"] = SentenceTransformer("all-MiniLM-L6-v2")
    model = st.session_state["model"]

    if st.button("Load Document"):
        try:
            document_text = download_file_from_public_link(drive_link)
            st.success("Document downloaded successfully!")

            # Process the document
            chunks = preprocess_text(document_text)
            st.write(f"Document split into {len(chunks)} chunks.")

            # Batched encode is much faster than encoding chunk-by-chunk.
            embeddings = np.array(model.encode(chunks))

            # Create FAISS index
            index = create_faiss_index(embeddings)
            st.success("FAISS index created.")

            # Save index and chunks for the query phase below.
            st.session_state["index"] = index
            st.session_state["chunks"] = chunks

        except Exception as e:
            st.error(f"Failed to load document: {str(e)}")

    if "index" in st.session_state and "chunks" in st.session_state:
        query = st.text_input("Enter your query")
        if query:
            query_embedding = model.encode([query])
            indices, distances = query_faiss_index(st.session_state["index"], query_embedding)

            # Display results and collect them for the LLM prompt.
            st.write("Relevant Chunks:")
            relevant_chunks = []
            for i, idx in enumerate(indices):
                st.write(f"Chunk {i + 1} (Distance: {distances[i]}):")
                chunk = st.session_state["chunks"][idx]
                st.write(chunk)
                relevant_chunks.append(chunk)

            # Ground the LLM answer in the retrieved chunks (true RAG).
            context = "\n\n".join(relevant_chunks)
            prompt = (
                "Answer the question using only the context below.\n\n"
                f"Context:\n{context}\n\n"
                f"Question: {query}"
            )
            client = Groq(api_key=groq_api_key)
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama-3.3-70b-versatile",
            )
            st.write("Groq Model Response:")
            st.write(chat_completion.choices[0].message.content)
110
-
111
 
112
# Script entry point: launch the Streamlit app.
if __name__ == "__main__":
    main()
 
1
  import os
 
2
  import streamlit as st
3
+ import PyPDF2
4
+ import requests
5
  import numpy as np
6
  import faiss
 
7
  from groq import Groq
8
 
9
# Initialize Groq client using the secret environment variable.
# NOTE(review): os.getenv returns None when GROQ_API_KEY is unset; the
# failure would then surface only on the first API call — confirm the
# secret is configured in the deployment environment.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
 
 
 
 
 
 
 
11
 
12
# Function to download and read PDF content
def extract_text_from_google_drive():
    """Download the project PDF from Google Drive and return its plain text.

    Fixes over the original:
    - adds a request timeout so the Streamlit app cannot hang forever,
    - raises requests.HTTPError on a failed download (the original wrote
      whatever bytes came back — often an HTML error page — to document.pdf
      and then crashed inside PyPDF2 with a confusing parse error),
    - tolerates pages where extract_text() returns None (image-only pages).
    """
    link = "https://drive.google.com/uc?id=1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0"
    response = requests.get(link, timeout=30)
    response.raise_for_status()
    with open("document.pdf", "wb") as file:
        file.write(response.content)

    with open("document.pdf", "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # extract_text() may return None; treat such pages as empty text.
        text = " ".join([(page.extract_text() or "") for page in reader.pages])
    return text
23
 
24
# Function to chunk text
def chunk_text(text, max_length=500):
    """Greedily pack '. '-separated sentences into chunks of <= max_length chars.

    Bug fix: the original appended an empty-string chunk whenever the very
    first sentence alone exceeded max_length; empty chunks are now skipped.
    """
    sentences = text.split(". ")
    chunks = []
    chunk = ""
    for sentence in sentences:
        if len(chunk) + len(sentence) <= max_length:
            chunk += sentence + ". "
        else:
            if chunk:  # never emit an empty chunk
                chunks.append(chunk.strip())
            chunk = sentence + ". "
    if chunk:
        chunks.append(chunk.strip())
    return chunks
38
 
39
# Function to compute embeddings manually (dummy implementation for simplicity)
def compute_embeddings(chunks):
    """Map each chunk to a fixed 300-dim float32 vector of character codes.

    Each chunk is truncated to its first 300 characters; the vector holds
    their ord() values, zero-padded on the right to length 300. This is a
    placeholder embedding, not a semantic one.
    """
    vectors = []
    for text in chunks:
        codes = [ord(ch) for ch in text[:300]]   # truncate to 300 characters
        codes.extend([0] * (300 - len(codes)))   # right-pad with zeros
        vectors.append(np.array(codes, dtype=np.float32))
    return np.array(vectors)
48
+
49
# Function to create FAISS index
def create_faiss_index(embeddings):
    """Wrap the embedding matrix in an exact (brute-force) L2 FAISS index."""
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index
55
 
56
# Function to query Groq API
def query_groq(question, model_name="llama-3.3-70b-versatile"):
    """Send *question* as a single user message to Groq and return the reply text."""
    messages = [{"role": "user", "content": question}]
    completion = client.chat.completions.create(messages=messages, model=model_name)
    return completion.choices[0].message.content
63
 
64
# Streamlit app
def main():
    """Streamlit entry point: index the Drive PDF once, then answer questions.

    Fixes over the original:
    - the PDF download / chunking / indexing now happens once per session
      (cached in st.session_state) instead of on every Streamlit rerun —
      the original re-downloaded and re-indexed the document on every
      widget interaction,
    - the Groq prompt now contains BOTH the retrieved chunk and the user's
      question (the original called query_groq(relevant_chunk), so the
      model never saw the question it was supposed to answer).
    """
    st.title("RAG-based Application with Groq API")
    st.subheader("Query the document stored on Google Drive")

    # Heavy work (download + parse + index) is cached across reruns.
    if "chunks" not in st.session_state or "index" not in st.session_state:
        st.write("Extracting text from the document...")
        text = extract_text_from_google_drive()
        st.write("Document text extracted successfully!")

        st.write("Chunking and embedding text...")
        chunks = chunk_text(text)
        embeddings = compute_embeddings(chunks)
        st.session_state["chunks"] = chunks
        st.session_state["index"] = create_faiss_index(embeddings)
        st.write(f"Created FAISS index with {len(chunks)} chunks.")

    chunks = st.session_state["chunks"]
    index = st.session_state["index"]

    # Query input
    question = st.text_input("Ask a question based on the document:")
    if question:
        st.write("Searching for relevant chunks...")
        question_embedding = compute_embeddings([question])[0]
        _, indices = index.search(np.array([question_embedding]), k=1)
        relevant_chunk = chunks[indices[0][0]]

        st.write("Generating answer using Groq API...")
        # Give the model the retrieved context AND the actual question.
        prompt = (
            "Answer the question using only the context below.\n\n"
            f"Context:\n{relevant_chunk}\n\n"
            f"Question: {question}"
        )
        answer = query_groq(prompt)
        st.write("### Answer:")
        st.write(answer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
# Script entry point: launch the Streamlit app.
if __name__ == "__main__":
    main()