Spaces:
Runtime error
Runtime error
File size: 2,695 Bytes
import os
import pdfplumber
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import shutil
import warnings
import logging
# Input PDF to index and the directory where the Chroma database is persisted.
DATA_PATH = "data/impots.pdf"
CHROMA_PATH = "chroma_db"

# Keep console output quiet while parsing PDFs. The logger level only affects
# the "pdfplumber" logger, whereas the warnings filter below is process-wide:
# it ignores every warning category from every module, not just pdfplumber's.
logging.getLogger("pdfplumber").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")
def _clean_cell(cell):
    """Return a table cell as single-line text that is safe inside a Markdown row."""
    if cell is None:
        return ""
    # Wrapped cell text keeps its line breaks; a raw newline would end the
    # Markdown row early and a raw pipe would introduce a spurious column.
    single_line = " ".join(str(cell).splitlines())
    return single_line.replace("|", "\\|")


def _table_to_markdown(table):
    """Render a table (list of rows, first row used as header) as Markdown text."""
    header_row = [_clean_cell(cell) for cell in table[0]]
    lines = [
        "| " + " | ".join(header_row) + " |",
        "| " + " | ".join(["---"] * len(header_row)) + " |",
    ]
    lines.extend(
        "| " + " | ".join(_clean_cell(cell) for cell in row) + " |"
        for row in table[1:]
    )
    # Each row, including the last one, is newline-terminated.
    return "\n".join(lines) + "\n"


def load_documents():
    """Load the PDF at ``DATA_PATH`` as one ``Document`` per page.

    For every page the extracted text is followed by each extracted table
    rendered as a Markdown table. Tables that are empty or have an empty
    first row are skipped. ``None`` cells become empty strings, and line
    breaks / pipe characters inside cells are neutralised so each table row
    stays on a single Markdown line.

    Returns:
        A list of ``Document`` objects in page order; each carries the
        1-based page number under the ``"page"`` metadata key.
    """
    documents = []
    with pdfplumber.open(DATA_PATH) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # Both extractors may return None; normalise to empty values.
            text = page.extract_text() or ""
            tables = page.extract_tables() or []
            table_texts = [
                _table_to_markdown(table)
                for table in tables
                if table and table[0]
            ]
            full_page = text + "\n\n" + "\n\n".join(table_texts)
            documents.append(
                Document(page_content=full_page, metadata={"page": page_number})
            )
    return documents
def create_db():
    """Build the Chroma vector store from the source PDF and persist it.

    Loads the per-page documents via ``load_documents()``, splits them into
    overlapping chunks, deletes any existing directory at ``CHROMA_PATH``,
    then embeds the chunks and writes a fresh store to that directory.
    Progress is reported with ``print``.

    Returns:
        The vector store object returned by ``Chroma.from_documents``.
    """
    documents = load_documents()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Loaded {len(documents)} document(s)")
    print(f"Split into {len(chunks)} chunks")

    # Clear existing DB so the new store does not mix with stale data.
    if os.path.exists(CHROMA_PATH):
        print(f"\nClearing existing database at {CHROMA_PATH}...")
        shutil.rmtree(CHROMA_PATH)

    # Name the model once so the progress message always matches the model
    # that is actually loaded.
    model_name = "dangvantuan/sentence-camembert-base"  # French-specific embeddings
    print(
        "\nCreating ChromaDB vector store with HuggingFace embeddings "
        f"({model_name})..."
    )
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=CHROMA_PATH,
    )

    print(f"✅ Successfully created ChromaDB with {len(chunks)} chunks!")
    print(f"📁 Database saved to: {CHROMA_PATH}")
    return vectorstore
# Rebuild the vector store only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    create_db()
|