Guillaumedbx committed on
Commit
6208436
·
1 Parent(s): a42113e

Remove unused Python scripts from the project

Browse files
archivemails.py DELETED
File without changes
codetravail.py DELETED
File without changes
downloadModels.py DELETED
File without changes
jurisprudence.py DELETED
File without changes
setup_vectorstore.py DELETED
File without changes
src/utils/jurisprudence.py DELETED
@@ -1,92 +0,0 @@
1
- import os
2
- import time
3
- from langchain.text_splitter import RecursiveCharacterTextSplitter
4
- from langchain_community.vectorstores import Chroma
5
- from langchain_community.embeddings import HuggingFaceEmbeddings
6
-
7
- # Paramètres
8
- CHUNK_SIZE = 500
9
- CHUNK_OVERLAP = 100
10
- DB_PATH = os.path.abspath("../../db") # Chemin racine du projet
11
- EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
12
- ARCHIVE_DIR = os.path.abspath("./data/archives_mails")
13
- JURIS_DIR = os.path.abspath("./data/jurisprudence")
14
-
15
- print("[INFO] Chargement des mails depuis :", ARCHIVE_DIR)
16
- mail_files = [f for f in os.listdir(ARCHIVE_DIR) if os.path.isfile(os.path.join(ARCHIVE_DIR, f))]
17
- print(f"[INFO] {len(mail_files)} fichiers trouvés.")
18
-
19
- print("[INFO] Chargement des décisions depuis :", JURIS_DIR)
20
- juris_files = [f for f in os.listdir(JURIS_DIR) if os.path.isfile(os.path.join(JURIS_DIR, f))]
21
- print(f"[INFO] {len(juris_files)} fichiers trouvés.")
22
-
23
- splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
24
- documents = []
25
- metadatas = []
26
-
27
- for idx, filename in enumerate(mail_files):
28
- file_path = os.path.join(ARCHIVE_DIR, filename)
29
- try:
30
- with open(file_path, 'r', encoding='utf-8') as f:
31
- content = f.read()
32
- except Exception as e:
33
- print(f"[WARN] Impossible de lire {filename} : {e}")
34
- continue
35
- if not content.strip():
36
- print(f"[WARN] Fichier vide ignoré : {filename}")
37
- continue
38
- for chunk in splitter.split_text(content):
39
- documents.append(chunk)
40
- metadatas.append({
41
- "source": "archive_mail",
42
- "filename": filename
43
- })
44
- print(f"[INFO] {len(documents)} chunks générés à partir des mails.")
45
-
46
- for idx, filename in enumerate(juris_files):
47
- file_path = os.path.join(JURIS_DIR, filename)
48
- try:
49
- with open(file_path, 'r', encoding='utf-8') as f:
50
- content = f.read()
51
- except Exception as e:
52
- print(f"[WARN] Impossible de lire {filename} : {e}")
53
- continue
54
- if not content.strip():
55
- print(f"[WARN] Fichier vide ignoré : {filename}")
56
- continue
57
- for chunk in splitter.split_text(content):
58
- documents.append(chunk)
59
- metadatas.append({
60
- "source": "jurisprudence",
61
- "filename": filename
62
- })
63
- print(f"[INFO] {len(documents)} chunks générés à partir des décisions.")
64
-
65
- print(f"[INFO] Chargement des embeddings ({EMBEDDING_MODEL})...")
66
- embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
67
-
68
- # Charger ou créer la base Chroma existante
69
- if os.path.exists(DB_PATH):
70
- print(f"[INFO] Ouverture de la base vectorielle existante : {DB_PATH}")
71
- db = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)
72
- else:
73
- print(f"[INFO] Création d'une nouvelle base vectorielle : {DB_PATH}")
74
- os.makedirs(DB_PATH, exist_ok=True)
75
- db = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)
76
-
77
- # Ajout des nouveaux documents
78
- print("[INFO] Ajout des nouveaux mails et décisions à la base vectorielle...")
79
- t0 = time.time()
80
- db.add_texts(documents, metadatas=metadatas)
81
- db.persist()
82
- t1 = time.time()
83
- print(f"[SUCCESS] {len(documents)} chunks de mails et décisions ajoutés à la base vectorielle en {t1-t0:.1f} secondes.")
84
-
85
- # Affichage du total de documents dans la base
86
- try:
87
- total_docs = db._collection.count()
88
- print(f"[INFO] Total de documents dans la base vectorielle après ajout : {total_docs}")
89
- except Exception as e:
90
- print(f"[WARN] Impossible de compter le nombre total de documents : {e}")
91
-
92
- print(f"[INFO] La base vectorielle est prête dans : {DB_PATH}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/utils/setup_vectorstore.py DELETED
File without changes