"""Build a Chroma vector store from the text and tables of a PDF file.

Each PDF page becomes one document made of the page's extracted text
followed by its tables rendered as Markdown. The documents are split into
overlapping chunks, embedded, and persisted to ``CHROMA_PATH``.
"""

import logging
import os
import shutil
import warnings

import pdfplumber
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Suppress pdfplumber warnings about PDF parsing issues.
# NOTE: this filter is process-wide, so it also hides warnings raised by
# every other library imported by this process.
warnings.filterwarnings("ignore")
logging.getLogger("pdfplumber").setLevel(logging.ERROR)

DATA_PATH = "data/impots.pdf"
CHROMA_PATH = "chroma_db"
# French-specific embeddings. Kept in one place so the progress message and
# the model that is actually loaded cannot drift apart.
EMBEDDING_MODEL_NAME = "dangvantuan/sentence-camembert-base"


def _clean_cell(cell) -> str:
    """Return a table cell as single-line text that is safe inside a Markdown row.

    ``None`` cells become an empty string. Line breaks inside a cell are
    collapsed to spaces and ``|`` is escaped, because either one would
    otherwise split or corrupt the Markdown row.
    """
    if cell is None:
        return ""
    return " ".join(str(cell).splitlines()).replace("|", "\\|")


def _table_to_markdown(table) -> str:
    """Render a table (a list of rows of cells) as a Markdown table.

    The first row is used as the header. The caller must ensure that
    ``table`` and ``table[0]`` are non-empty. Every line, including the
    last one, is terminated by a newline.
    """
    header = [_clean_cell(cell) for cell in table[0]]
    lines = [
        "| " + " | ".join(header) + " |",
        "| " + " | ".join(["---"] * len(header)) + " |",
    ]
    lines.extend(
        "| " + " | ".join(_clean_cell(cell) for cell in row) + " |"
        for row in table[1:]
    )
    return "\n".join(lines) + "\n"


def load_documents() -> list[Document]:
    """Read ``DATA_PATH`` and return one ``Document`` per PDF page.

    The page content is the extracted page text, a blank line, and then the
    page's tables as Markdown separated by blank lines. The 1-based page
    number is stored under the ``"page"`` metadata key.

    Returns:
        The documents in page order.
    """
    documents = []
    with pdfplumber.open(DATA_PATH) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text() or ""
            tables = page.extract_tables() or []
            # Skip tables that are empty or have no header row.
            table_texts = [
                _table_to_markdown(table)
                for table in tables
                if table and table[0]
            ]
            full_page = text + "\n\n" + "\n\n".join(table_texts)
            documents.append(
                Document(page_content=full_page, metadata={"page": page_number})
            )
    return documents


def create_db() -> Chroma:
    """Rebuild the Chroma database at ``CHROMA_PATH`` from ``DATA_PATH``.

    The documents are split into chunks of up to 1000 characters with a
    200-character overlap. Any existing database directory is deleted and
    replaced.

    Returns:
        The newly created vector store.

    Raises:
        ValueError: If no chunks could be produced from the PDF. This is
            checked before the existing database is deleted, so a bad input
            file does not destroy a previously built database.
    """
    documents = load_documents()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)

    print(f"Loaded {len(documents)} document(s)")
    print(f"Split into {len(chunks)} chunks")

    if not chunks:
        raise ValueError(
            f"No text could be extracted from {DATA_PATH}; "
            f"the database at {CHROMA_PATH} was left untouched."
        )

    # Clear existing DB
    if os.path.exists(CHROMA_PATH):
        print(f"\nClearing existing database at {CHROMA_PATH}...")
        shutil.rmtree(CHROMA_PATH)

    print(
        "\nCreating ChromaDB vector store with HuggingFace embeddings "
        f"({EMBEDDING_MODEL_NAME})..."
    )
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=CHROMA_PATH,
    )

    print(f"✅ Successfully created ChromaDB with {len(chunks)} chunks!")
    print(f"📁 Database saved to: {CHROMA_PATH}")

    return vectorstore


if __name__ == "__main__":
    create_db()