File size: 2,695 Bytes
6e52b29
 
3e09ea4
0472254
f516652
0472254
6e52b29
38854c4
 
 
 
 
 
0472254
dfe7d61
0472254
 
 
6e52b29
 
 
 
 
 
 
1d88fd5
6e52b29
1d88fd5
 
 
 
6e52b29
1d88fd5
 
6e52b29
 
 
0472254
 
 
 
6e52b29
0472254
6e52b29
 
 
 
0472254
 
6e52b29
0472254
 
6e52b29
 
0472254
 
 
6e52b29
 
06c53c9
fc8e15c
06c53c9
6e52b29
0472254
 
38854c4
0472254
 
6e52b29
0472254
 
 
 
 
6e52b29
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import pdfplumber
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import shutil
import warnings
import logging

# Suppress pdfplumber warnings about PDF parsing issues
# NOTE(review): filterwarnings("ignore") silences ALL warnings process-wide,
# not only pdfplumber's — consider narrowing (e.g. module="pdfplumber").
warnings.filterwarnings("ignore")
logging.getLogger("pdfplumber").setLevel(logging.ERROR)

# Source PDF to ingest and the on-disk directory for the Chroma store.
DATA_PATH = "data/impots.pdf"
CHROMA_PATH = "chroma_db"

def _table_to_markdown(table):
    """Render one pdfplumber table (list of rows) as a GitHub-style markdown table.

    The first row is treated as the header; None cells (empty/merged cells in
    the PDF) are rendered as empty strings. The result ends with a newline,
    matching one markdown line per table row.
    """
    def render_row(cells):
        return "| " + " | ".join("" if cell is None else str(cell) for cell in cells) + " |"

    lines = [
        render_row(table[0]),
        "| " + " | ".join(["---"] * len(table[0])) + " |",
    ]
    lines.extend(render_row(row) for row in table[1:])
    # Join once instead of repeated string += (avoids quadratic concatenation).
    return "\n".join(lines) + "\n"


def load_documents():
    """Load the PDF at DATA_PATH into one Document per page.

    Each page's plain text is extracted, and every table on the page is
    rendered as markdown and appended after the text, so tabular data
    survives downstream chunking and embedding.

    Returns:
        list[Document]: one Document per page, with 1-based "page" metadata.
    """
    documents = []
    with pdfplumber.open(DATA_PATH) as pdf:
        for i, page in enumerate(pdf.pages):
            text = page.extract_text() or ""
            tables = page.extract_tables() or []
            table_texts = []
            for table in tables:
                # Skip empty tables and tables whose header row is missing.
                if not table or not table[0]:
                    continue
                table_texts.append(_table_to_markdown(table))
            full_page = text + "\n\n" + "\n\n".join(table_texts)
            documents.append(Document(page_content=full_page, metadata={"page": i + 1}))
    return documents

def create_db():
    """Build (or rebuild) the Chroma vector store from the source PDF.

    Loads the PDF into per-page Documents, splits them into overlapping
    chunks, wipes any previously persisted database, then embeds the chunks
    with a French sentence-embedding model and persists the store.

    Returns:
        Chroma: the persisted vector store built from the chunks.
    """
    documents = load_documents()

    # 1000-char chunks with 200-char overlap; start indices are recorded so
    # chunks can later be traced back to their position in the page.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)

    print(f"Loaded {len(documents)} document(s)")
    print(f"Split into {len(chunks)} chunks")

    # Rebuild from scratch so stale chunks from earlier runs never linger.
    if os.path.exists(CHROMA_PATH):
        print(f"\nClearing existing database at {CHROMA_PATH}...")
        shutil.rmtree(CHROMA_PATH)

    # BUG FIX: this message previously claimed "all-MiniLM-L6-v2", but the
    # model actually loaded below is the French-specific sentence-camembert-base.
    print("\nCreating ChromaDB vector store with HuggingFace embeddings (sentence-camembert-base)...")
    embeddings = HuggingFaceEmbeddings(
        model_name="dangvantuan/sentence-camembert-base"  # French-specific embeddings
    )

    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=CHROMA_PATH
    )

    print(f"βœ… Successfully created ChromaDB with {len(chunks)} chunks!")
    print(f"πŸ“ Database saved to: {CHROMA_PATH}")
    return vectorstore

if __name__ == "__main__":
    # Build (or rebuild) the vector store when run as a script.
    create_db()