# RAG / data_cutter.py
# (from commit fc8e15c by Kakarot21 — "feat: Switch to French-specific models
#  and refine prompt engineering for improved French RAG.")
import os
import pdfplumber
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import shutil
import warnings
import logging
# Suppress pdfplumber warnings about PDF parsing issues
# (malformed/scanned PDFs emit noisy parse warnings we cannot act on here).
warnings.filterwarnings("ignore")
logging.getLogger("pdfplumber").setLevel(logging.ERROR)
# Path to the source PDF (French tax documentation).
DATA_PATH = "data/impots.pdf"
# Directory where the persisted Chroma vector store is written.
CHROMA_PATH = "chroma_db"
def _table_to_markdown(table):
    """Render one pdfplumber table (a list of rows of cells) as markdown.

    `None` cells are rendered as empty strings. The first row is treated
    as the header. Each row ends with a newline, matching the chunking
    format used downstream.
    """
    header = [str(cell) if cell is not None else "" for cell in table[0]]
    md_table = "| " + " | ".join(header) + " |\n"
    md_table += "| " + " | ".join(["---"] * len(header)) + " |\n"
    for row in table[1:]:
        cells = [str(cell) if cell is not None else "" for cell in row]
        md_table += "| " + " | ".join(cells) + " |\n"
    return md_table


def load_documents():
    """Load the PDF at DATA_PATH into one Document per page.

    Each page's plain text is combined with any tables on the page,
    rendered as markdown so the tabular structure survives text
    splitting and embedding.

    Returns:
        list[Document]: one Document per page, with 1-based "page"
        metadata.
    """
    documents = []
    with pdfplumber.open(DATA_PATH) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text() or ""
            table_texts = [
                _table_to_markdown(table)
                for table in (page.extract_tables() or [])
                # Skip empty tables and tables with no header row.
                if table and table[0]
            ]
            # Only append the table section when tables exist, so
            # table-free pages don't get a dangling "\n\n" separator.
            if table_texts:
                full_page = text + "\n\n" + "\n\n".join(table_texts)
            else:
                full_page = text
            documents.append(
                Document(page_content=full_page, metadata={"page": page_number})
            )
    return documents
def create_db():
    """Build (or rebuild) the Chroma vector store from the source PDF.

    Loads the PDF pages, splits them into overlapping chunks, deletes any
    existing database directory so stale chunks never linger, then embeds
    and persists a fresh Chroma store.

    Returns:
        Chroma: the persisted vector store.
    """
    documents = load_documents()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,  # record each chunk's character offset in its page
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Loaded {len(documents)} document(s)")
    print(f"Split into {len(chunks)} chunks")
    # Clear existing DB so the store always reflects only the current PDF.
    if os.path.exists(CHROMA_PATH):
        print(f"\nClearing existing database at {CHROMA_PATH}...")
        shutil.rmtree(CHROMA_PATH)
    # BUG FIX: the log previously claimed "all-MiniLM-L6-v2" while the code
    # loads the French camembert model — keep the message in sync.
    print("\nCreating ChromaDB vector store with HuggingFace embeddings (dangvantuan/sentence-camembert-base)...")
    embeddings = HuggingFaceEmbeddings(
        model_name="dangvantuan/sentence-camembert-base"  # French-specific embeddings
    )
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=CHROMA_PATH,
    )
    print(f"βœ… Successfully created ChromaDB with {len(chunks)} chunks!")
    print(f"πŸ“ Database saved to: {CHROMA_PATH}")
    return vectorstore
# Script entry point: rebuild the vector store when run directly.
if __name__ == "__main__":
    create_db()