# app.py — Streamlit RAG PDF search app

import fitz  # PyMuPDF for PDF handling
from transformers import AutoTokenizer, AutoModel
import faiss
import torch
import streamlit as st

# Load model and tokenizer for embeddings
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Efficient model for embeddings
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Initialize FAISS index for efficient similarity search
embedding_dim = 384  # Dimension of MiniLM embeddings
index = faiss.IndexFlatL2(embedding_dim)
document_chunks = []
chunk_mappings = []


def embed_text(text):
    """Generate an embedding for a text chunk."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings.numpy()


def extract_text_from_pdf(file_path):
    """Extract text from a PDF file."""
    text = ""
    with fitz.open(file_path) as pdf:
        for page in pdf:
            text += page.get_text("text")
    return text


def chunk_text(text, chunk_size=500):
    """Divide the text into manageable fixed-size chunks."""
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


def index_pdf(file_path):
    """Process a PDF file, create embeddings, and store them in the FAISS index."""
    text = extract_text_from_pdf(file_path)
    chunks = chunk_text(text)
    for i, chunk in enumerate(chunks):
        chunk_embedding = embed_text(chunk)
        index.add(chunk_embedding)  # Add to FAISS index
        document_chunks.append(chunk)
        chunk_mappings.append((file_path, i))  # Track chunk-to-file mappings
    print(f"Indexed {len(chunks)} chunks from {file_path}")


def search(query, top_k=5):
    """Search for relevant document chunks based on the query."""
    query_embedding = embed_text(query)
    distances, indices = index.search(query_embedding, top_k)
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        if idx == -1:  # FAISS pads with -1 when fewer than top_k vectors are indexed
            continue
        file_path, chunk_idx = chunk_mappings[idx]
        results.append({"file": file_path, "text": document_chunks[idx], "distance": dist})
    return results


# Streamlit interface
st.title("RAG PDF Search System")

# Upload PDF files
uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
if uploaded_files:
    for uploaded_file in uploaded_files:
        file_path = f"temp_{uploaded_file.name}"
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        index_pdf(file_path)

# Query input
query = st.text_input("Enter your search query:")
if query:
    results = search(query)
    for result in results:
        st.write(f"**File:** {result['file']}")
        st.write(result["text"])
        st.write(f"**Distance (lower is more relevant):** {result['distance']}\n")
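# Optional refinement (a sketch, not part of the app above): sentence-transformer
# models such as MiniLM are usually pooled with an attention-mask-aware mean, so
# that padding tokens do not dilute the embedding when several chunks are
# tokenized together as a batch. The name embed_text_batch is illustrative; it
# could replace embed_text if you switch to batched indexing.

def embed_text_batch(texts):
    """Embed a list of texts with mask-aware mean pooling (assumed variant)."""
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state           # (batch, seq, 384)
    mask = inputs["attention_mask"].unsqueeze(-1).float()    # (batch, seq, 1)
    summed = (hidden * mask).sum(dim=1)                      # ignore padding tokens
    counts = mask.sum(dim=1).clamp(min=1e-9)                 # real tokens per sequence
    return (summed / counts).numpy()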
chunk.\"\"\" inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True) with torch.no_grad(): embeddings = model(**inputs).last_hidden_state.mean(dim=1) return embeddings.numpy() def extract_text_from_pdf(file_path): \"\"\"Extract text from a PDF file.\"\"\" text = "" with fitz.open(file_path) as pdf: for page in pdf: text += page.get_text("text") return text def chunk_text(text, chunk_size=500): \"\"\"Divide the text into manageable chunks.\"\"\" return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)] def index_pdf(file_path): \"\"\"Process a PDF file, create embeddings, and store them in FAISS index.\"\"\" text = extract_text_from_pdf(file_path) chunks = chunk_text(text) for i, chunk in enumerate(chunks): chunk_embedding = embed_text(chunk) index.add(chunk_embedding) # Add to FAISS index document_chunks.append(chunk) chunk_mappings.append((file_path, i)) # Track chunk-to-file mappings print(f"Indexed {len(chunks)} chunks from {file_path}") def search(query, top_k=5): \"\"\"Search for relevant document chunks based on query.\"\"\" query_embedding = embed_text(query) distances, indices = index.search(query_embedding, top_k) results = [] for dist, idx in zip(distances[0], indices[0]): file_path, chunk_idx = chunk_mappings[idx] results.append({"file": file_path, "text": document_chunks[idx], "distance": dist}) return results # Streamlit interface st.title("RAG PDF Search System") # Upload PDF files uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True) if uploaded_files: for uploaded_file in uploaded_files: file_path = f"temp_{uploaded_file.name}" with open(file_path, "wb") as f: f.write(uploaded_file.getbuffer()) index_pdf(file_path) # Query input query = st.text_input("Enter your search query:") if query: results = search(query) for result in results: st.write(f"**File:** {result['file']}") st.write(result["text"]) st.write(f"**Relevance Score:** {result['distance']}\n") """ # Save the code to app.py with open("app.py", "w") as file: file.write(app_code) # Set up and start Ngrok for public access from pyngrok import ngrok ngrok.set_auth_token("YOUR_NGROK_AUTH_TOKEN") # Replace with your Ngrok auth token public_url = ngrok.connect(port="8501") print(f"Streamlit app is running at: {public_url}") # Run the Streamlit app !streamlit run app.py --server.port 8501