# RAG PDF Search System — notebook transcript (Hugging Face Space / Colab)
import fitz  # PyMuPDF for PDF handling
import faiss
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModel
| # Define model name | |
| model_name = "bert-base-uncased" # Replace with your desired model | |
| # Initialize tokenizer and model | |
| tokenizer = AutoTokenizer.from_pretrained(model_name) | |
| model = AutoModel.from_pretrained(model_name) | |
| # Load model and tokenizer for embedding | |
| model_name = "sentence-transformers/all-MiniLM-L6-v2" # Efficient model for embeddings | |
| tokenizer = AutoTokenizer.from_pretrained(model_name) | |
| model = AutoModel.from_pretrained(model_name) | |
| # Initialize FAISS index for efficient similarity search | |
| embedding_dim = 384 # Dimension of MiniLM embeddings | |
| index = faiss.IndexFlatL2(embedding_dim) | |
| document_chunks = [] | |
| chunk_mappings = [] | |
def embed_text(text):
    """Embed *text* with the module-level MiniLM tokenizer/model.

    Returns a numpy array of shape (batch, 384): mean-pooled token
    embeddings. Pooling is masked with the attention mask so that, if a
    list of texts is ever passed (the tokenizer pads them to equal length),
    padding tokens do not drag the mean down. For a single string no
    padding is added, so this is numerically identical to the original
    unmasked ``mean(dim=1)``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state  # (batch, seq, 384)
    # 1.0 for real tokens, 0.0 for padding; broadcast over the hidden dim.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    # clamp avoids 0/0 for a (degenerate) all-padding row.
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return pooled.numpy()
def extract_text_from_pdf(file_path):
    """Return the concatenated plain text of every page of the PDF at *file_path*."""
    with fitz.open(file_path) as pdf:
        page_texts = [page.get_text("text") for page in pdf]
    return "".join(page_texts)
def chunk_text(text, chunk_size=500):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    The final piece may be shorter; an empty string yields an empty list.
    """
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size
    return chunks
def index_pdf(file_path):
    """Embed every chunk of the PDF at *file_path* into the global FAISS index."""
    chunks = chunk_text(extract_text_from_pdf(file_path))
    for chunk_no, chunk in enumerate(chunks):
        index.add(embed_text(chunk))                   # vector row in FAISS
        document_chunks.append(chunk)                  # raw text at the same row
        chunk_mappings.append((file_path, chunk_no))   # provenance for that row
    print(f"Indexed {len(chunks)} chunks from {file_path}")
def search(query, top_k=5):
    """Return up to *top_k* indexed chunks most similar to *query*.

    Each result is a dict with "file", "text" and "distance" (L2 distance;
    lower means more similar).

    Bug fix: when the index holds fewer than *top_k* vectors, FAISS pads
    ``indices`` with -1. The original fed -1 straight into
    ``chunk_mappings[idx]`` and ``document_chunks[idx]``, silently returning
    the *last* chunk as a bogus match — those padding entries are skipped.
    """
    query_embedding = embed_text(query)
    distances, indices = index.search(query_embedding, top_k)
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        if idx < 0:  # FAISS padding when fewer than top_k vectors are indexed
            continue
        file_path, _chunk_no = chunk_mappings[idx]
        results.append({
            "file": file_path,
            "text": document_chunks[idx],
            # plain float rather than numpy scalar, for clean display/JSON
            "distance": float(dist),
        })
    return results
# --- Streamlit interface ---
st.title("RAG PDF Search System")

# PDF upload: each file is persisted to a temp copy on disk (fitz opens a
# path, not a buffer) and then indexed.
uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
if uploaded_files:
    for uploaded in uploaded_files:
        temp_path = f"temp_{uploaded.name}"
        with open(temp_path, "wb") as out:
            out.write(uploaded.getbuffer())
        index_pdf(temp_path)

# Free-text query: show the best-matching chunks with their L2 distances.
# NOTE(review): Streamlit re-runs the whole script on every interaction, so
# uploads are re-embedded on each rerun — consider st.cache_resource.
query = st.text_input("Enter your search query:")
if query:
    for hit in search(query):
        st.write(f"**File:** {hit['file']}")
        st.write(hit["text"])
        st.write(f"**Relevance Score:** {hit['distance']}\n")
# Install necessary packages.
# NOTE(review): the original had a bare `pip install ...` line, which is a
# SyntaxError in Python. This file is a notebook cell, so use the IPython
# magic form (run in the notebook, or install from a shell):
# %pip install transformers faiss-cpu PyMuPDF streamlit pyngrok

# Streamlit app source, written to "app.py" below so `streamlit run` can
# serve it. The \"\"\" escapes render as plain docstring quotes in app.py.
# BUG FIXES vs the original template:
#   * the f-string at the bottom used `\n`, which the outer triple-quoted
#     string turned into a literal newline INSIDE the f-string in app.py —
#     a SyntaxError in the generated file; it is now escaped as `\\n`.
#   * search() skips the -1 padding indices FAISS returns when fewer than
#     top_k vectors are indexed.
app_code = """
import fitz  # PyMuPDF for PDF handling
from transformers import AutoTokenizer, AutoModel
import faiss
import torch
import streamlit as st

# Load model and tokenizer for embedding
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Efficient model for embeddings
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Initialize FAISS index for similarity search
embedding_dim = 384  # Dimension of MiniLM embeddings
index = faiss.IndexFlatL2(embedding_dim)
document_chunks = []
chunk_mappings = []

def embed_text(text):
    \"\"\"Generate embeddings for a text chunk.\"\"\"
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings.numpy()

def extract_text_from_pdf(file_path):
    \"\"\"Extract text from a PDF file.\"\"\"
    text = ""
    with fitz.open(file_path) as pdf:
        for page in pdf:
            text += page.get_text("text")
    return text

def chunk_text(text, chunk_size=500):
    \"\"\"Divide the text into manageable chunks.\"\"\"
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

def index_pdf(file_path):
    \"\"\"Process a PDF file, create embeddings, and store them in FAISS index.\"\"\"
    text = extract_text_from_pdf(file_path)
    chunks = chunk_text(text)
    for i, chunk in enumerate(chunks):
        chunk_embedding = embed_text(chunk)
        index.add(chunk_embedding)  # Add to FAISS index
        document_chunks.append(chunk)
        chunk_mappings.append((file_path, i))  # Track chunk-to-file mappings
    print(f"Indexed {len(chunks)} chunks from {file_path}")

def search(query, top_k=5):
    \"\"\"Search for relevant document chunks based on query.\"\"\"
    query_embedding = embed_text(query)
    distances, indices = index.search(query_embedding, top_k)
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        if idx < 0:  # FAISS pads with -1 when fewer than top_k chunks exist
            continue
        file_path, chunk_idx = chunk_mappings[idx]
        results.append({"file": file_path, "text": document_chunks[idx], "distance": dist})
    return results

# Streamlit interface
st.title("RAG PDF Search System")

# Upload PDF files
uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
if uploaded_files:
    for uploaded_file in uploaded_files:
        file_path = f"temp_{uploaded_file.name}"
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        index_pdf(file_path)

# Query input
query = st.text_input("Enter your search query:")
if query:
    results = search(query)
    for result in results:
        st.write(f"**File:** {result['file']}")
        st.write(result["text"])
        st.write(f"**Relevance Score:** {result['distance']}\\n")
"""

# Save the code to app.py (explicit encoding so the write does not depend on
# the platform's default codec).
with open("app.py", "w", encoding="utf-8") as file:
    file.write(app_code)
# Set up and start Ngrok for public access to the local Streamlit port.
from pyngrok import ngrok

ngrok.set_auth_token("YOUR_NGROK_AUTH_TOKEN")  # Replace with your Ngrok auth token

# pyngrok >= 5 removed the `port=` keyword; the address/port is the first
# positional argument of connect().
public_url = ngrok.connect(8501)
print(f"Streamlit app is running at: {public_url}")

# Run the Streamlit app.
# NOTE(review): `!streamlit ...` is IPython shell-escape syntax and a
# SyntaxError in plain Python — run it from the notebook or a shell:
# !streamlit run app.py --server.port 8501