File size: 2,695 Bytes
6e52b29
 
3e09ea4
0472254
f516652
0472254
6e52b29
38854c4
 
 
 
 
 
0472254
dfe7d61
0472254
 
 
6e52b29
 
 
 
 
 
 
1d88fd5
6e52b29
1d88fd5
 
 
 
6e52b29
1d88fd5
 
6e52b29
 
 
0472254
 
 
 
6e52b29
0472254
6e52b29
 
 
 
0472254
 
6e52b29
0472254
 
6e52b29
 
0472254
 
 
6e52b29
 
06c53c9
fc8e15c
06c53c9
6e52b29
0472254
 
38854c4
0472254
 
6e52b29
0472254
 
 
 
 
6e52b29
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import pdfplumber
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import shutil
import warnings
import logging

# Suppress pdfplumber warnings about PDF parsing issues
# NOTE(review): filterwarnings("ignore") silences ALL warnings process-wide,
# not only pdfplumber's — consider narrowing (e.g. module="pdfplumber").
warnings.filterwarnings("ignore")
logging.getLogger("pdfplumber").setLevel(logging.ERROR)

# Source PDF to ingest and the on-disk directory for the Chroma store.
DATA_PATH = "data/impots.pdf"
CHROMA_PATH = "chroma_db"

def _table_to_markdown(table):
    """Render one pdfplumber table (list of rows) as a GitHub-style markdown table.

    The first row is treated as the header; None cells (empty/merged cells in
    the PDF) are rendered as empty strings. The result ends with a newline,
    matching one markdown line per table row.
    """
    def render_row(cells):
        return "| " + " | ".join("" if cell is None else str(cell) for cell in cells) + " |"

    lines = [
        render_row(table[0]),
        "| " + " | ".join(["---"] * len(table[0])) + " |",
    ]
    lines.extend(render_row(row) for row in table[1:])
    # Join once instead of repeated string += (avoids quadratic concatenation).
    return "\n".join(lines) + "\n"


def load_documents():
    """Load the PDF at DATA_PATH into one Document per page.

    Each page's plain text is extracted, and every table on the page is
    rendered as markdown and appended after the text, so tabular data
    survives downstream chunking and embedding.

    Returns:
        list[Document]: one Document per page, with 1-based "page" metadata.
    """
    documents = []
    with pdfplumber.open(DATA_PATH) as pdf:
        for i, page in enumerate(pdf.pages):
            text = page.extract_text() or ""
            tables = page.extract_tables() or []
            table_texts = []
            for table in tables:
                # Skip empty tables and tables whose header row is missing.
                if not table or not table[0]:
                    continue
                table_texts.append(_table_to_markdown(table))
            full_page = text + "\n\n" + "\n\n".join(table_texts)
            documents.append(Document(page_content=full_page, metadata={"page": i + 1}))
    return documents

def create_db():
    """Build (or rebuild) the Chroma vector store from the source PDF.

    Loads the PDF into per-page Documents, splits them into overlapping
    chunks, wipes any previously persisted database, then embeds the chunks
    with a French sentence-embedding model and persists the store.

    Returns:
        Chroma: the persisted vector store built from the chunks.
    """
    documents = load_documents()

    # 1000-char chunks with 200-char overlap; start indices are recorded so
    # chunks can later be traced back to their position in the page.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)

    print(f"Loaded {len(documents)} document(s)")
    print(f"Split into {len(chunks)} chunks")

    # Rebuild from scratch so stale chunks from earlier runs never linger.
    if os.path.exists(CHROMA_PATH):
        print(f"\nClearing existing database at {CHROMA_PATH}...")
        shutil.rmtree(CHROMA_PATH)

    # BUG FIX: this message previously claimed "all-MiniLM-L6-v2", but the
    # model actually loaded below is the French-specific sentence-camembert-base.
    print("\nCreating ChromaDB vector store with HuggingFace embeddings (sentence-camembert-base)...")
    embeddings = HuggingFaceEmbeddings(
        model_name="dangvantuan/sentence-camembert-base"  # French-specific embeddings
    )

    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=CHROMA_PATH
    )

    print(f"βœ… Successfully created ChromaDB with {len(chunks)} chunks!")
    print(f"πŸ“ Database saved to: {CHROMA_PATH}")
    return vectorstore

if __name__ == "__main__":
    # Build (or rebuild) the vector store when run as a script.
    create_db()