Spaces:
Runtime error
Runtime error
| import os | |
| import pdfplumber | |
| from langchain_core.documents import Document | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_chroma import Chroma | |
| import shutil | |
| import warnings | |
| import logging | |
# Silence noisy output while parsing PDFs.
# NOTE: this filter is given no category or module, so it ignores every
# warning raised in the process, not only those coming from pdfplumber.
warnings.filterwarnings("ignore")
# Let the "pdfplumber" logger emit only ERROR-level records and above.
logging.getLogger("pdfplumber").setLevel(logging.ERROR)
# Path of the PDF that load_documents() opens.
DATA_PATH = "data/impots.pdf"
# Directory that create_db() wipes and then persists the Chroma store into.
CHROMA_PATH = "chroma_db"
def _markdown_cell(cell):
    """Return *cell* as text that is safe inside one Markdown table cell.

    ``None`` becomes an empty string. Line breaks are flattened to spaces and
    pipe characters are backslash-escaped, because a Markdown table row must
    stay on a single line and uses the pipe as its column delimiter.
    """
    if cell is None:
        return ""
    single_line = " ".join(str(cell).splitlines())
    return single_line.replace("|", "\\|")


def _table_to_markdown(table):
    """Render one extracted table (a list of rows of cells) as Markdown.

    The first row is used as the header. Every emitted line, including the
    last one, is terminated by a newline.
    """
    header = [_markdown_cell(cell) for cell in table[0]]
    lines = [
        "| " + " | ".join(header) + " |",
        "| " + " | ".join(["---"] * len(header)) + " |",
    ]
    lines.extend(
        "| " + " | ".join(_markdown_cell(cell) for cell in row) + " |"
        for row in table[1:]
    )
    return "\n".join(lines) + "\n"


def load_documents():
    """Read the PDF at ``DATA_PATH`` and return one ``Document`` per page.

    For each page the extracted text is followed by a blank line and then by
    every non-empty table found on that page, rendered as a Markdown table.
    Tables that are empty or whose first row is empty are skipped.

    Returns:
        A list of ``Document`` objects whose ``metadata["page"]`` holds the
        1-based page number.
    """
    documents = []
    with pdfplumber.open(DATA_PATH) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # Both extractors may yield a falsy value; normalise to empty.
            text = page.extract_text() or ""
            tables = page.extract_tables() or []
            table_texts = [
                _table_to_markdown(table)
                for table in tables
                if table and table[0]
            ]
            full_page = text + "\n\n" + "\n\n".join(table_texts)
            documents.append(
                Document(page_content=full_page, metadata={"page": page_number})
            )
    return documents
def create_db():
    """Rebuild the Chroma vector store from the source PDF.

    Loads the pages with ``load_documents()``, splits them into overlapping
    chunks, embeds the chunks with a HuggingFace model and persists the
    result under ``CHROMA_PATH``. Any directory already present at
    ``CHROMA_PATH`` is deleted first. Progress is reported with ``print``.

    Returns:
        The ``Chroma`` vector store built from the chunks.
    """
    documents = load_documents()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Loaded {len(documents)} document(s)")
    print(f"Split into {len(chunks)} chunks")

    # Single source of truth for the model name so the progress messages
    # cannot drift from the model that is actually loaded.
    model_name = "dangvantuan/sentence-camembert-base"  # French-specific embeddings

    # Load the model BEFORE deleting the old database: if loading fails, the
    # previously built store is still on disk.
    print(f"\nLoading embedding model {model_name}...")
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Clear existing DB
    if os.path.exists(CHROMA_PATH):
        print(f"\nClearing existing database at {CHROMA_PATH}...")
        shutil.rmtree(CHROMA_PATH)

    print(
        "\nCreating ChromaDB vector store with HuggingFace embeddings "
        f"({model_name})..."
    )
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=CHROMA_PATH,
    )
    print(f"Successfully created ChromaDB with {len(chunks)} chunks!")
    print(f"Database saved to: {CHROMA_PATH}")
    return vectorstore
# Rebuild the vector database only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    create_db()