Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Embed papers for RAG chatbot. | |
| Run this locally before deploying to HuggingFace Space. | |
| Usage: | |
| 1. Place your PDF papers in the papers/ directory | |
| 2. Run: python embed_papers.py | |
| 3. This creates index/faiss.index and index/chunks.json | |
| 4. Commit and push to HuggingFace Space | |
| """ | |
| import os | |
| import json | |
| import fitz # PyMuPDF | |
| import numpy as np | |
| from pathlib import Path | |
| from sentence_transformers import SentenceTransformer | |
# Configuration
PAPERS_DIR = Path("papers")  # input directory scanned (non-recursively) for *.pdf
INDEX_DIR = Path("index")    # output directory for faiss.index and chunks.json
CHUNK_SIZE = 500    # characters per chunk (target; may end early at a sentence boundary)
CHUNK_OVERLAP = 100 # characters shared between consecutive chunks for context continuity
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # sentence-transformers model id
def extract_text_from_pdf(pdf_path: Path) -> list[dict]:
    """Extract per-page text from a PDF.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        One dict per non-empty page with keys:
        "text" (raw page text), "page" (1-based page number),
        "source" (file stem, used downstream for citations).
    """
    pages: list[dict] = []
    # Context manager guarantees the document handle is released even if
    # get_text() raises mid-file; the original leaked the handle on error.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, 1):
            text = page.get_text()
            if text.strip():  # skip blank / image-only pages
                pages.append({
                    "text": text,
                    "page": page_num,
                    "source": pdf_path.stem,
                })
    return pages
| def chunk_text(pages: list[dict], chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[dict]: | |
| """Split pages into overlapping chunks.""" | |
| chunks = [] | |
| for page_data in pages: | |
| text = page_data["text"] | |
| source = page_data["source"] | |
| page_num = page_data["page"] | |
| # Split into chunks with overlap | |
| start = 0 | |
| while start < len(text): | |
| end = start + chunk_size | |
| chunk_text = text[start:end] | |
| # Try to break at sentence boundary | |
| if end < len(text): | |
| last_period = chunk_text.rfind('. ') | |
| if last_period > chunk_size // 2: | |
| chunk_text = chunk_text[:last_period + 1] | |
| end = start + last_period + 1 | |
| if chunk_text.strip(): | |
| chunks.append({ | |
| "text": chunk_text.strip(), | |
| "source": source, | |
| "page": page_num, | |
| "chunk_id": len(chunks) | |
| }) | |
| start = end - overlap if end < len(text) else len(text) | |
| return chunks | |
def create_embeddings(chunks: list[dict], model: SentenceTransformer) -> np.ndarray:
    """Encode every chunk's text into a dense vector.

    Args:
        chunks: Chunk dicts as produced by ``chunk_text``; only the
            "text" field is read here.
        model: Loaded sentence-transformers model.

    Returns:
        A 2-D numpy array with one embedding row per chunk.
    """
    return model.encode(
        [entry["text"] for entry in chunks],
        show_progress_bar=True,
        convert_to_numpy=True,
    )
def save_faiss_index(embeddings: np.ndarray, output_path: Path):
    """Build and persist a FAISS inner-product index.

    Vectors are L2-normalized before indexing, so inner-product search is
    equivalent to cosine similarity.

    Args:
        embeddings: 2-D array of shape (n_chunks, dim).
        output_path: Destination file for the serialized index.
    """
    import faiss

    # FAISS requires contiguous float32, and normalize_L2 works in place —
    # operate on a copy so the caller's array is not mutated as a side
    # effect (the original normalized the caller's buffer).
    vectors = np.array(embeddings, dtype=np.float32, copy=True)
    faiss.normalize_L2(vectors)

    # Inner product == cosine similarity after normalization.
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)

    faiss.write_index(index, str(output_path))
    print(f"Saved FAISS index with {index.ntotal} vectors to {output_path}")
def main():
    """Build the retrieval index: PDFs -> chunks -> embeddings -> FAISS + JSON."""
    # Ensure the output directory exists.
    INDEX_DIR.mkdir(exist_ok=True)

    # Find all PDFs (non-recursive).
    pdf_files = list(PAPERS_DIR.glob("*.pdf"))
    if not pdf_files:
        print(f"No PDF files found in {PAPERS_DIR}/")
        print("Please add your research papers to the papers/ directory.")
        return

    print(f"Found {len(pdf_files)} PDF files:")
    for pdf in pdf_files:
        print(f" - {pdf.name}")

    # Extract text and split into overlapping chunks.
    all_chunks = []
    for pdf_path in pdf_files:
        print(f"\nProcessing {pdf_path.name}...")
        pages = extract_text_from_pdf(pdf_path)
        chunks = chunk_text(pages)
        all_chunks.extend(chunks)
        print(f" Extracted {len(pages)} pages, {len(chunks)} chunks")

    print(f"\nTotal chunks: {len(all_chunks)}")

    # Guard: encoding an empty chunk list would yield a zero-row embedding
    # array and a useless index downstream (the original did not check).
    if not all_chunks:
        print("No extractable text found in any PDF; nothing to index.")
        return

    # Load the embedding model (downloads on first use).
    print(f"\nLoading embedding model: {EMBEDDING_MODEL}")
    model = SentenceTransformer(EMBEDDING_MODEL)

    # Generate embeddings.
    print("Generating embeddings...")
    embeddings = create_embeddings(all_chunks, model)
    print(f"Embeddings shape: {embeddings.shape}")

    # Save the FAISS index.
    save_faiss_index(embeddings, INDEX_DIR / "faiss.index")

    # Save chunk metadata; the chatbot maps FAISS row ids back to these
    # dicts at query time, so the JSON order must match the index order.
    chunks_path = INDEX_DIR / "chunks.json"
    with open(chunks_path, "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, ensure_ascii=False, indent=2)
    print(f"Saved chunk metadata to {chunks_path}")

    # Summary. (f-prefixes removed from strings with no placeholders, F541.)
    print("\n" + "=" * 50)
    print("DONE! Your index is ready.")
    print("=" * 50)
    print("\nFiles created:")
    print(f" - {INDEX_DIR}/faiss.index ({embeddings.shape[0]} vectors)")
    print(f" - {INDEX_DIR}/chunks.json ({len(all_chunks)} chunks)")
    print("\nNext steps:")
    print(" 1. Commit these files to your HuggingFace Space")
    print(" 2. The chatbot will use this index for retrieval")


if __name__ == "__main__":
    main()