Spaces:
Sleeping
Sleeping
"""Index archived mails into the persistent Chroma vector store.

Every regular file found in ARCHIVE_DIR is read as UTF-8 text and added to
the store located at DB_PATH as a single document (mails are not split into
chunks). Unreadable and empty files are skipped with a warning.
"""

import os
import time

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Settings
# NOTE: both paths are relative, so they resolve against the current working
# directory at launch time, not against the location of this file.
DB_PATH = os.path.abspath("../../db")
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
ARCHIVE_DIR = os.path.abspath("../../data/archives_mails")

print("[INFO] Chargement des mails depuis :", ARCHIVE_DIR)
# Sorted so the insertion order is reproducible from one run to the next;
# os.listdir() returns entries in arbitrary order.
mail_files = sorted(
    name
    for name in os.listdir(ARCHIVE_DIR)
    if os.path.isfile(os.path.join(ARCHIVE_DIR, name))
)
print(f"[INFO] {len(mail_files)} fichiers trouvés.")

documents = []
metadatas = []
for filename in mail_files:
    file_path = os.path.join(ARCHIVE_DIR, filename)
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        # Best effort: one unreadable or non-UTF-8 file must not abort the run.
        print(f"[WARN] Impossible de lire {filename} : {e}")
        continue
    if not content.strip():
        print(f"[WARN] Fichier vide ignoré : {filename}")
        continue
    # No splitting for mails: each mail is exactly one document.
    documents.append(content)
    metadatas.append({
        "source": "archive_mail",
        "filename": filename,
    })
print(f"[INFO] {len(documents)} mails ajoutés (1 document par mail, pas de découpage).")

print(f"[INFO] Chargement des embeddings ({EMBEDDING_MODEL})...")
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Open the existing Chroma store, or create its directory first. The store
# object is built the same way in both cases, so it is constructed once below.
if os.path.exists(DB_PATH):
    print(f"[INFO] Ouverture de la base vectorielle existante : {DB_PATH}")
else:
    print(f"[INFO] Création d'une nouvelle base vectorielle : {DB_PATH}")
    os.makedirs(DB_PATH, exist_ok=True)
db = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)

# Add the new documents. The call is skipped when nothing was loaded:
# NOTE(review): chromadb appears to reject empty batches with a ValueError —
# confirm against the installed version.
if documents:
    print("[INFO] Ajout des nouveaux mails à la base vectorielle...")
    started = time.perf_counter()  # monotonic clock, suited to durations
    db.add_texts(documents, metadatas=metadatas)
    # NOTE(review): persist() is reportedly deprecated in recent
    # langchain-community releases (Chroma >= 0.4 persists automatically) —
    # confirm before removing.
    db.persist()
    elapsed = time.perf_counter() - started
    print(f"[SUCCESS] {len(documents)} chunks de mails ajoutés à la base vectorielle en {elapsed:.1f} secondes.")
else:
    print("[WARN] Aucun mail à ajouter : la base vectorielle n'est pas modifiée.")

# Report the total number of documents now held by the store. This relies on
# the private `_collection` attribute, so any failure is only reported.
try:
    total_docs = db._collection.count()
    print(f"[INFO] Total de documents dans la base vectorielle après ajout : {total_docs}")
except Exception as e:
    print(f"[WARN] Impossible de compter le nombre total de documents : {e}")
print(f"[INFO] La base vectorielle est prête dans : {DB_PATH}")