khababakhtar committed on
Commit
0b21087
·
verified ·
1 Parent(s): 6c61daf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -59
app.py CHANGED
@@ -1,78 +1,84 @@
1
  import os
2
- import re
3
- import tempfile
4
- import pytesseract
5
- from pdf2image import convert_from_path
6
  import numpy as np
7
  import faiss
8
- from groq import Groq
 
9
  import requests
10
  import streamlit as st
 
11
 
12
# Groq client used for every model call; the API key is read from the
# GROQ_API_KEY environment variable (None is passed when it is unset).
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
14
 
15
# Function to download and process Google Drive PDF
def extract_text_from_pdf(download_url):
    """Download a PDF and return the text recognised on its pages by OCR.

    The response body is written to a temporary ``.pdf`` file, converted to
    page images with ``convert_from_path`` and each image is passed to
    ``pytesseract.image_to_string``.

    Args:
        download_url: URL the PDF is fetched from.

    Returns:
        The OCR output of all page images concatenated in order.

    Raises:
        ValueError: If the response status code is not 200.
    """
    # A timeout keeps a stalled download from blocking the app indefinitely.
    response = requests.get(download_url, timeout=60)
    if response.status_code != 200:
        raise ValueError("Failed to download the PDF from the provided link.")

    # delete=False: the file must outlive the `with` block so that it can be
    # reopened by path; it is removed explicitly below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(response.content)
    try:
        images = convert_from_path(temp_pdf.name)
    finally:
        # Previously the temporary file was left behind on every call.
        os.remove(temp_pdf.name)

    return "".join(pytesseract.image_to_string(image) for image in images)
29
 
30
# Preprocess text into chunks
def preprocess_text(text, chunk_size=512):
    """Split ``text`` into chunks of at most ``chunk_size`` words.

    Words are separated on any run of whitespace and re-joined with single
    spaces, so the chunks contain no newlines or repeated spaces.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # Zero used to surface as an opaque range() error and negative
        # values silently produced no chunks at all.
        raise ValueError("chunk_size must be a positive integer")

    # str.split() with no separator already collapses whitespace runs, so no
    # separate normalisation pass is needed.
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
36
 
37
# Store chunks in FAISS
def store_chunks_in_faiss(chunks):
    """Build a flat L2 FAISS index containing one vector per chunk.

    The vectors are random placeholders: they are not computed from the
    chunk text, only their count depends on ``chunks``.

    Args:
        chunks: Sequence of text chunks; only its length is used.

    Returns:
        The populated ``faiss.IndexFlatL2`` index.
    """
    vector_dim = 768  # Assume embeddings are 768-dimensional
    n_vectors = len(chunks)
    # Dummy embeddings
    embeddings = np.random.rand(n_vectors, vector_dim).astype("float32")

    index = faiss.IndexFlatL2(vector_dim)
    index.add(embeddings)
    return index
44
 
45
# Query Groq API
def query_groq_model(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the ``content`` of the user message.

    Returns:
        The message content of the first choice in the completion.
    """
    user_message = {"role": "user", "content": prompt}
    completion = groq_client.chat.completions.create(
        messages=[user_message],
        model="llama-3.3-70b-versatile",
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
52
 
53
# Streamlit frontend
st.title("RAG-Based Application")

drive_url = st.text_input("Enter Google Drive File URL:")
query = st.text_input("Enter your query:")
if st.button("Process"):
    if drive_url and query:
        # Accept both ".../d/<id>/..." and "...?id=<id>" share links. The
        # previous split("/d/")[1] raised IndexError on the second form and
        # the user only saw "list index out of range".
        id_match = (
            re.search(r"/d/([\w-]+)", drive_url)
            or re.search(r"[?&]id=([\w-]+)", drive_url)
        )
        if id_match is None:
            st.error("Could not find a file ID in the Google Drive URL.")
        else:
            file_id = id_match.group(1)
            download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
            try:
                with st.spinner("Processing document..."):
                    document_text = extract_text_from_pdf(download_url)
                    chunks = preprocess_text(document_text)
                    index = store_chunks_in_faiss(chunks)
                    st.success("Document processed and stored in vector database.")

                # NOTE(review): `index` is built but never searched, and the
                # model receives only `query`, with no document text attached.
                with st.spinner("Querying model..."):
                    response = query_groq_model(query)
                    st.write("Model Response:")
                    st.write(response)
            except Exception as e:
                # UI boundary: report the failure instead of a raw traceback.
                st.error(f"An error occurred: {e}")
    else:
        st.error("Please provide both Google Drive File URL and query.")
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
 
 
 
2
  import numpy as np
3
  import faiss
4
+ import pytesseract
5
+ from pdf2image import convert_from_path
6
  import requests
7
  import streamlit as st
8
+ from groq import Groq
9
 
10
# Groq client used for the question-answering call; the API key is read
# from the GROQ_API_KEY environment variable (None is passed when unset).
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
12
 
13
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text recognised by OCR on the pages of a PDF file.

    The PDF is converted to page images with ``convert_from_path`` and each
    image is passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The OCR output of all page images concatenated in order; an empty
        string when no images are produced.
    """
    pages = convert_from_path(pdf_path)
    # join() builds the result in one pass instead of repeated `+=` copies.
    return "".join(pytesseract.image_to_string(page) for page in pages)
 
 
 
 
 
 
 
20
 
21
# Function to chunk the text
def create_chunks(text, chunk_size=200):
    """Split ``text`` into chunks of at most ``chunk_size`` words.

    Words are separated on any run of whitespace and re-joined with single
    spaces.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # Zero used to surface as an opaque range() error and negative
        # values silently produced no chunks at all.
        raise ValueError("chunk_size must be a positive integer")

    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
26
 
27
# Function to store chunks in FAISS (GPU when available, CPU otherwise)
def store_chunks_in_faiss(chunks):
    """Build a flat L2 FAISS index containing one vector per chunk.

    The index is moved to GPU device 0 only when the installed FAISS build
    exposes GPU support and reports at least one GPU; otherwise it stays on
    the CPU. The stored vectors are random placeholders: they are not
    computed from the chunk text, only their count depends on ``chunks``.

    Args:
        chunks: Sequence of text chunks; only its length is used.

    Returns:
        The populated FAISS index (GPU or CPU).
    """
    vector_dim = 768  # Assuming embeddings are 768-dimensional
    index = faiss.IndexFlatL2(vector_dim)

    # The GPU move used to be unconditional, which fails on CPU-only FAISS
    # builds and on hosts without a GPU. hasattr() is checked first so the
    # GPU count is only queried when GPU classes exist in this build.
    gpu_ready = hasattr(faiss, "StandardGpuResources") and faiss.get_num_gpus() > 0
    if gpu_ready:
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)

    # Generate dummy embeddings for demonstration
    embeddings = np.random.rand(len(chunks), vector_dim).astype("float32")
    index.add(embeddings)
    return index
40
 
41
# Check whether FAISS reports any GPU
def is_gpu_available():
    """Return True when FAISS reports at least one GPU, False otherwise."""
    gpu_count = faiss.get_num_gpus()
    return gpu_count > 0
44
+
45
# Streamlit app interface
import tempfile  # used only for the uploaded PDF below

st.title("PDF Content Chunking and Retrieval with FAISS-GPU")

# PDF upload
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file:
    st.write("Processing the uploaded file...")
    # Write the upload to a unique temporary file instead of a fixed
    # "uploaded_file.pdf" in the working directory, which was shared by all
    # sessions and never cleaned up. delete=False lets the file be reopened
    # by path after the `with` block; it is removed explicitly afterwards.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
        tmp_pdf.write(uploaded_file.getbuffer())

    # Extract text
    try:
        extracted_text = extract_text_from_pdf(tmp_pdf.name)
    finally:
        os.remove(tmp_pdf.name)
    st.text_area("Extracted Text", extracted_text, height=200)

    # Chunk text
    st.write("Creating chunks...")
    chunks = create_chunks(extracted_text)
    st.write(f"Total chunks created: {len(chunks)}")

    # Store chunks in FAISS
    st.write("Storing chunks in FAISS...")
    index = store_chunks_in_faiss(chunks)

    if is_gpu_available():
        st.success("FAISS is using GPU resources!")
    else:
        st.warning("FAISS is running on CPU.")

    st.write("Chunks successfully stored in the FAISS index!")

    # Interaction with Groq
    # NOTE(review): `index` is never searched and only the question itself is
    # sent to the model, so the answer is not based on the uploaded PDF.
    user_input = st.text_input("Ask a question about the content:")
    if user_input:
        st.write("Sending query to Groq API...")
        try:
            response = client.chat.completions.create(
                messages=[{"role": "user", "content": user_input}],
                model="llama-3.3-70b-versatile"
            )
        except Exception as exc:
            # UI boundary: show the failure instead of a raw traceback.
            st.error(f"Groq API request failed: {exc}")
        else:
            st.text_area("Groq API Response", response.choices[0].message.content, height=100)