Spaces:

KunalShaw
/

RAG-Chatbot-Agentic-AI

Sleeping

File size: 10,915 Bytes

f9c215a

"""
ingest.py - PDF Ingestion Pipeline

This script handles the complete ingestion workflow:
1. Read PDF file and extract text by page
2. Clean the extracted text
3. Chunk the text with overlap (500 tokens, 50-100 overlap)
4. Generate embeddings using sentence-transformers
5. Upsert to Pinecone (or save locally with --local-only)
6. Save chunks.jsonl as backup

Usage:
    python app/ingest.py --pdf ./data/Ebook-Agentic-AI.pdf --index agentic-ai
    python app/ingest.py --pdf ./data/Ebook-Agentic-AI.pdf --local-only  # No Pinecone

Requires:
    - PINECONE_API_KEY environment variable (unless using --local-only)
    - PDF file at specified path
"""

import os
import sys
import argparse
from typing import List, Dict, Tuple
from tqdm import tqdm
from dotenv import load_dotenv

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Local imports
from app.utils import clean_text, chunk_text, save_chunks_to_jsonl
from app.vectorstore import get_vector_store, PineconeVectorStore, LocalVectorStore

# Load environment variables
load_dotenv()

# Try to import PDF library
try:
    import pdfplumber
    PDF_LIBRARY = "pdfplumber"
except ImportError:
    try:
        import PyPDF2
        PDF_LIBRARY = "PyPDF2"
    except ImportError:
        print("ERROR: Neither pdfplumber nor PyPDF2 installed. Please install one.")
        sys.exit(1)

# Embedding model
try:
    from sentence_transformers import SentenceTransformer
    EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
    EMBEDDING_DIM = 384
except ImportError:
    print("ERROR: sentence-transformers not installed")
    sys.exit(1)


def extract_text_from_pdf(pdf_path: str) -> List[Tuple[int, str]]:
    """
    Extract text from PDF file, returning text by page.
    
    Args:
        pdf_path: Path to the PDF file
        
    Returns:
        List of tuples: (page_number, page_text)
    """
    print(f"Extracting text from: {pdf_path}")
    pages = []
    
    if PDF_LIBRARY == "pdfplumber":
        with pdfplumber.open(pdf_path) as pdf:
            for i, page in enumerate(pdf.pages):
                text = page.extract_text() or ""
                pages.append((i + 1, text))  # 1-indexed page numbers
                
    elif PDF_LIBRARY == "PyPDF2":
        import PyPDF2
        with open(pdf_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            for i, page in enumerate(reader.pages):
                text = page.extract_text() or ""
                pages.append((i + 1, text))
    
    print(f"Extracted {len(pages)} pages")
    return pages


def load_embedding_model():
    """
    Load the sentence-transformers embedding model.
    
    Returns:
        SentenceTransformer model instance
    """
    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}")
    model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    print(f"Model loaded! Embedding dimension: {model.get_sentence_embedding_dimension()}")
    return model


def generate_embeddings(
    chunks: List[Dict],
    model: SentenceTransformer,
    batch_size: int = 32
) -> List[Dict]:
    """
    Generate embeddings for all chunks.
    
    Args:
        chunks: List of chunk dictionaries (must have 'text' key)
        model: SentenceTransformer model
        batch_size: Batch size for embedding generation
        
    Returns:
        Chunks with 'embedding' field added
    """
    print(f"Generating embeddings for {len(chunks)} chunks...")
    
    # Extract texts
    texts = [chunk['text'] for chunk in chunks]
    
    # Generate embeddings in batches
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    
    # Add embeddings to chunks
    for i, chunk in enumerate(chunks):
        chunk['embedding'] = embeddings[i].tolist()
    
    print(f"Generated {len(embeddings)} embeddings")
    return chunks


def run_ingestion(
    pdf_path: str,
    index_name: str = "agentic-ai-ebook",
    namespace: str = "agentic-ai",
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    local_only: bool = False,
    output_dir: str = "./data"
):
    """
    Run the complete ingestion pipeline.
    
    Args:
        pdf_path: Path to the PDF file
        index_name: Pinecone index name
        namespace: Pinecone namespace
        chunk_size: Target chunk size in tokens
        chunk_overlap: Overlap between chunks in tokens
        local_only: If True, skip Pinecone and save locally only
        output_dir: Directory for output files
    """
    print("=" * 60)
    print("RAG Ingestion Pipeline")
    print("=" * 60)
    
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Step 1: Extract text from PDF
    print("\n[Step 1/5] Extracting text from PDF...")
    pages = extract_text_from_pdf(pdf_path)
    
    if not pages:
        print("ERROR: No text extracted from PDF")
        return
    
    # Step 2: Clean and chunk text
    print("\n[Step 2/5] Cleaning and chunking text...")
    all_chunks = []
    source_name = os.path.basename(pdf_path)
    
    for page_num, page_text in tqdm(pages, desc="Processing pages"):
        # Clean the text
        cleaned_text = clean_text(page_text)
        
        if not cleaned_text.strip():
            continue
        
        # Chunk the text
        page_chunks = chunk_text(
            text=cleaned_text,
            page_number=page_num,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            source=source_name
        )
        
        all_chunks.extend(page_chunks)
    
    print(f"Created {len(all_chunks)} chunks from {len(pages)} pages")
    
    if not all_chunks:
        print("ERROR: No chunks created")
        return
    
    # Step 3: Load embedding model
    print("\n[Step 3/5] Loading embedding model...")
    embedding_model = load_embedding_model()
    
    # Step 4: Generate embeddings
    print("\n[Step 4/5] Generating embeddings...")
    chunks_with_embeddings = generate_embeddings(all_chunks, embedding_model)
    
    # Step 5: Store vectors
    print("\n[Step 5/5] Storing vectors...")
    
    if local_only:
        # Save to local files only
        print("Running in LOCAL-ONLY mode (no Pinecone)")
        
        # Save chunks to JSONL (without embeddings for smaller file)
        chunks_file = os.path.join(output_dir, "chunks.jsonl")
        save_chunks_to_jsonl(chunks_with_embeddings, chunks_file, include_embeddings=False)
        
        # Save to local vector store
        local_store = LocalVectorStore(dimension=EMBEDDING_DIM)
        local_store.upsert(chunks_with_embeddings)
        
        # Save vectors to file for later use
        vectors_file = os.path.join(output_dir, "vectors.json")
        local_store.save_to_file(vectors_file)
        
        print(f"\nLocal files saved to {output_dir}/")
        
    else:
        # Upsert to Pinecone
        api_key = os.getenv("PINECONE_API_KEY")
        
        if not api_key:
            print("ERROR: PINECONE_API_KEY not set. Use --local-only to run without Pinecone.")
            # Fall back to local only
            print("Falling back to local-only mode...")
            chunks_file = os.path.join(output_dir, "chunks.jsonl")
            save_chunks_to_jsonl(chunks_with_embeddings, chunks_file, include_embeddings=False)
            return
        
        # Initialize Pinecone vector store
        vector_store = PineconeVectorStore(
            api_key=api_key,
            index_name=index_name,
            namespace=namespace,
            dimension=EMBEDDING_DIM
        )
        
        # Create index if needed
        if not vector_store.create_index_if_missing():
            print("ERROR: Failed to create/connect to Pinecone index")
            return
        
        # Upsert vectors
        upserted = vector_store.upsert(chunks_with_embeddings)
        
        # Also save chunks locally as backup
        chunks_file = os.path.join(output_dir, "chunks.jsonl")
        save_chunks_to_jsonl(chunks_with_embeddings, chunks_file, include_embeddings=False)
        
        # Print stats
        stats = vector_store.get_index_stats()
        print(f"\nPinecone index stats: {stats}")
    
    print("\n" + "=" * 60)
    print("Ingestion complete!")
    print("=" * 60)
    print(f"- Total chunks: {len(chunks_with_embeddings)}")
    print(f"- Chunks file: {os.path.join(output_dir, 'chunks.jsonl')}")
    if not local_only:
        print(f"- Pinecone index: {index_name}")
        print(f"- Namespace: {namespace}")
    print("=" * 60)


def main():
    """Main entry point with argument parsing."""
    parser = argparse.ArgumentParser(
        description="Ingest PDF into vector store for RAG",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Ingest to Pinecone (requires PINECONE_API_KEY env var)
    python app/ingest.py --pdf ./data/Ebook-Agentic-AI.pdf --index agentic-ai
    
    # Local-only mode (no Pinecone needed)
    python app/ingest.py --pdf ./data/Ebook-Agentic-AI.pdf --local-only
    
    # Custom chunk size
    python app/ingest.py --pdf ./data/Ebook-Agentic-AI.pdf --chunk-size 400 --overlap 75
        """
    )
    
    parser.add_argument(
        "--pdf",
        type=str,
        required=True,
        help="Path to the PDF file to ingest"
    )
    
    parser.add_argument(
        "--index",
        type=str,
        default="agentic-ai-ebook",
        help="Pinecone index name (default: agentic-ai-ebook)"
    )
    
    parser.add_argument(
        "--namespace",
        type=str,
        default="agentic-ai",
        help="Pinecone namespace (default: agentic-ai)"
    )
    
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=500,
        help="Target chunk size in tokens (default: 500)"
    )
    
    parser.add_argument(
        "--overlap",
        type=int,
        default=50,
        help="Chunk overlap in tokens (default: 50)"
    )
    
    parser.add_argument(
        "--local-only",
        action="store_true",
        help="Run without Pinecone, save vectors locally"
    )
    
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./data",
        help="Output directory for local files (default: ./data)"
    )
    
    args = parser.parse_args()
    
    # Validate PDF path
    if not os.path.exists(args.pdf):
        print(f"ERROR: PDF file not found: {args.pdf}")
        sys.exit(1)
    
    # Run ingestion
    run_ingestion(
        pdf_path=args.pdf,
        index_name=args.index,
        namespace=args.namespace,
        chunk_size=args.chunk_size,
        chunk_overlap=args.overlap,
        local_only=args.local_only,
        output_dir=args.output_dir
    )


if __name__ == "__main__":
    main()