# RAG / data_cutter.py
# (from commit fc8e15c by Kakarot21 — "feat: Switch to French-specific models
#  and refine prompt engineering for improved French RAG.")
import os
import pdfplumber
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import shutil
import warnings
import logging
# Suppress pdfplumber warnings about PDF parsing issues
# (malformed/scanned PDFs emit noisy parse warnings we cannot act on here).
warnings.filterwarnings("ignore")
logging.getLogger("pdfplumber").setLevel(logging.ERROR)
# Path to the source PDF (French tax documentation).
DATA_PATH = "data/impots.pdf"
# Directory where the persisted Chroma vector store is written.
CHROMA_PATH = "chroma_db"
def _table_to_markdown(table):
    """Render one pdfplumber table (a list of rows of cells) as markdown.

    `None` cells are rendered as empty strings. The first row is treated
    as the header. Each row ends with a newline, matching the chunking
    format used downstream.
    """
    header = [str(cell) if cell is not None else "" for cell in table[0]]
    md_table = "| " + " | ".join(header) + " |\n"
    md_table += "| " + " | ".join(["---"] * len(header)) + " |\n"
    for row in table[1:]:
        cells = [str(cell) if cell is not None else "" for cell in row]
        md_table += "| " + " | ".join(cells) + " |\n"
    return md_table


def load_documents():
    """Load the PDF at DATA_PATH into one Document per page.

    Each page's plain text is combined with any tables on the page,
    rendered as markdown so the tabular structure survives text
    splitting and embedding.

    Returns:
        list[Document]: one Document per page, with 1-based "page"
        metadata.
    """
    documents = []
    with pdfplumber.open(DATA_PATH) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text() or ""
            table_texts = [
                _table_to_markdown(table)
                for table in (page.extract_tables() or [])
                # Skip empty tables and tables with no header row.
                if table and table[0]
            ]
            # Only append the table section when tables exist, so
            # table-free pages don't get a dangling "\n\n" separator.
            if table_texts:
                full_page = text + "\n\n" + "\n\n".join(table_texts)
            else:
                full_page = text
            documents.append(
                Document(page_content=full_page, metadata={"page": page_number})
            )
    return documents
def create_db():
    """Build (or rebuild) the Chroma vector store from the source PDF.

    Loads the PDF pages, splits them into overlapping chunks, deletes any
    existing database directory so stale chunks never linger, then embeds
    and persists a fresh Chroma store.

    Returns:
        Chroma: the persisted vector store.
    """
    documents = load_documents()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,  # record each chunk's character offset in its page
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Loaded {len(documents)} document(s)")
    print(f"Split into {len(chunks)} chunks")
    # Clear existing DB so the store always reflects only the current PDF.
    if os.path.exists(CHROMA_PATH):
        print(f"\nClearing existing database at {CHROMA_PATH}...")
        shutil.rmtree(CHROMA_PATH)
    # BUG FIX: the log previously claimed "all-MiniLM-L6-v2" while the code
    # loads the French camembert model — keep the message in sync.
    print("\nCreating ChromaDB vector store with HuggingFace embeddings (dangvantuan/sentence-camembert-base)...")
    embeddings = HuggingFaceEmbeddings(
        model_name="dangvantuan/sentence-camembert-base"  # French-specific embeddings
    )
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=CHROMA_PATH,
    )
    print(f"βœ… Successfully created ChromaDB with {len(chunks)} chunks!")
    print(f"πŸ“ Database saved to: {CHROMA_PATH}")
    return vectorstore
# Script entry point: rebuild the vector store when run directly.
if __name__ == "__main__":
    create_db()