app-rag / app.py
muhammadshaheryar's picture
Update app.py
98f5d0f verified
import fitz # PyMuPDF for PDF handling
from transformers import AutoTokenizer, AutoModel
import faiss
import torch
import streamlit as st
# Define model name
model_name = "bert-base-uncased" # Replace with your desired model
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Load model and tokenizer for embedding
model_name = "sentence-transformers/all-MiniLM-L6-v2" # Efficient model for embeddings
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Initialize FAISS index for efficient similarity search
embedding_dim = 384 # Dimension of MiniLM embeddings
index = faiss.IndexFlatL2(embedding_dim)
document_chunks = []
chunk_mappings = []
def embed_text(text):
"""Generate embeddings for a text chunk."""
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
with torch.no_grad():
embeddings = model(**inputs).last_hidden_state.mean(dim=1)
return embeddings.numpy()
def extract_text_from_pdf(file_path):
"""Extract text from a PDF file."""
text = ""
with fitz.open(file_path) as pdf:
for page in pdf:
text += page.get_text("text")
return text
def chunk_text(text, chunk_size=500):
"""Divide the text into manageable chunks."""
return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
def index_pdf(file_path):
"""Process a PDF file, create embeddings, and store them in FAISS index."""
text = extract_text_from_pdf(file_path)
chunks = chunk_text(text)
for i, chunk in enumerate(chunks):
chunk_embedding = embed_text(chunk)
index.add(chunk_embedding) # Add to FAISS index
document_chunks.append(chunk)
chunk_mappings.append((file_path, i)) # Track chunk-to-file mappings
print(f"Indexed {len(chunks)} chunks from {file_path}")
def search(query, top_k=5):
"""Search for relevant document chunks based on query."""
query_embedding = embed_text(query)
distances, indices = index.search(query_embedding, top_k)
results = []
for dist, idx in zip(distances[0], indices[0]):
file_path, chunk_idx = chunk_mappings[idx]
results.append({"file": file_path, "text": document_chunks[idx], "distance": dist})
return results
# Streamlit interface
st.title("RAG PDF Search System")
# Upload PDF files
uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
if uploaded_files:
for uploaded_file in uploaded_files:
file_path = f"temp_{uploaded_file.name}"
with open(file_path, "wb") as f:
f.write(uploaded_file.getbuffer())
index_pdf(file_path)
# Query input
query = st.text_input("Enter your search query:")
if query:
results = search(query)
for result in results:
st.write(f"**File:** {result['file']}")
st.write(result["text"])
st.write(f"**Relevance Score:** {result['distance']}\n")
# Install necessary packages
pip install transformers faiss-cpu PyMuPDF streamlit pyngrok
# Write the Streamlit app code to a file named "app.py"
app_code = """
import fitz # PyMuPDF for PDF handling
from transformers import AutoTokenizer, AutoModel
import faiss
import torch
import streamlit as st
# Load model and tokenizer for embedding
model_name = "sentence-transformers/all-MiniLM-L6-v2" # Efficient model for embeddings
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Initialize FAISS index for similarity search
embedding_dim = 384 # Dimension of MiniLM embeddings
index = faiss.IndexFlatL2(embedding_dim)
document_chunks = []
chunk_mappings = []
def embed_text(text):
\"\"\"Generate embeddings for a text chunk.\"\"\"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
with torch.no_grad():
embeddings = model(**inputs).last_hidden_state.mean(dim=1)
return embeddings.numpy()
def extract_text_from_pdf(file_path):
\"\"\"Extract text from a PDF file.\"\"\"
text = ""
with fitz.open(file_path) as pdf:
for page in pdf:
text += page.get_text("text")
return text
def chunk_text(text, chunk_size=500):
\"\"\"Divide the text into manageable chunks.\"\"\"
return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
def index_pdf(file_path):
\"\"\"Process a PDF file, create embeddings, and store them in FAISS index.\"\"\"
text = extract_text_from_pdf(file_path)
chunks = chunk_text(text)
for i, chunk in enumerate(chunks):
chunk_embedding = embed_text(chunk)
index.add(chunk_embedding) # Add to FAISS index
document_chunks.append(chunk)
chunk_mappings.append((file_path, i)) # Track chunk-to-file mappings
print(f"Indexed {len(chunks)} chunks from {file_path}")
def search(query, top_k=5):
\"\"\"Search for relevant document chunks based on query.\"\"\"
query_embedding = embed_text(query)
distances, indices = index.search(query_embedding, top_k)
results = []
for dist, idx in zip(distances[0], indices[0]):
file_path, chunk_idx = chunk_mappings[idx]
results.append({"file": file_path, "text": document_chunks[idx], "distance": dist})
return results
# Streamlit interface
st.title("RAG PDF Search System")
# Upload PDF files
uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
if uploaded_files:
for uploaded_file in uploaded_files:
file_path = f"temp_{uploaded_file.name}"
with open(file_path, "wb") as f:
f.write(uploaded_file.getbuffer())
index_pdf(file_path)
# Query input
query = st.text_input("Enter your search query:")
if query:
results = search(query)
for result in results:
st.write(f"**File:** {result['file']}")
st.write(result["text"])
st.write(f"**Relevance Score:** {result['distance']}\n")
"""
# Save the code to app.py
with open("app.py", "w") as file:
file.write(app_code)
# Set up and start Ngrok for public access
from pyngrok import ngrok
ngrok.set_auth_token("YOUR_NGROK_AUTH_TOKEN") # Replace with your Ngrok auth token
public_url = ngrok.connect(port="8501")
print(f"Streamlit app is running at: {public_url}")
# Run the Streamlit app
!streamlit run app.py --server.port 8501