AxL95 committed on
Commit
0a3b5e1
·
verified ·
1 Parent(s): fb4c43f

Update admin.py

Browse files
Files changed (1) hide show
  1. admin.py +83 -22
admin.py CHANGED
@@ -4,6 +4,8 @@ import os
4
  import PyPDF2
5
  from io import BytesIO
6
  from datetime import datetime
 
 
7
 
8
  from auth import get_admin_user
9
  from database import get_db
@@ -32,44 +34,93 @@ async def upload_pdf(
32
  for page_num in range(len(pdf_reader.pages)):
33
  text_content += pdf_reader.pages[page_num].extract_text() + "\n"
34
 
35
- embedding = None
36
- if embedding_model:
37
- try:
38
- max_length = 5000
39
- truncated_text = text_content[:max_length]
40
- embedding = embedding_model.embed_query(truncated_text)
41
- except Exception as e:
42
- print(f"Erreur: {str(e)}")
43
-
44
  doc_id = ObjectId()
45
 
46
  pdf_path = f"/tmp/{str(doc_id)}.pdf"
47
-
48
  with open(pdf_path, "wb") as f:
49
  pdf_file.seek(0)
50
  f.write(contents)
51
 
52
- document = {
 
 
 
 
 
 
 
53
  "_id": doc_id,
54
- "text": text_content,
55
- "embedding": embedding,
56
  "title": title or file.filename,
57
  "tags": tags.split(",") if tags else [],
58
  "uploaded_by": str(current_user["_id"]),
59
- "upload_date": datetime.utcnow()
 
 
 
60
  }
61
 
62
- print(f"Tentative d'insertion du document avec ID: {doc_id}")
63
- result = db.connaissances.insert_one(document)
64
- print(f"Document inséré avec ID: {result.inserted_id}")
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  verification = db.connaissances.find_one({"_id": doc_id})
67
  if verification:
68
- print(f"Document vérifié et trouvé dans la base de données")
69
- return {"success": True, "document_id": str(doc_id)}
 
 
 
 
 
70
  else:
71
- print(f"ERREUR: Document non trouvé après insertion")
72
- return {"success": False, "error": "Document non trouvé après insertion"}
 
 
 
73
 
74
  except Exception as e:
75
  import traceback
@@ -109,6 +160,13 @@ async def delete_document(document_id: str, current_user: dict = Depends(get_adm
109
  if not document:
110
  raise HTTPException(status_code=404, detail="Document non trouvé")
111
 
 
 
 
 
 
 
 
112
  result = db.connaissances.delete_one({"_id": doc_id})
113
 
114
  if result.deleted_count == 0:
@@ -122,7 +180,10 @@ async def delete_document(document_id: str, current_user: dict = Depends(get_adm
122
  except Exception as e:
123
  print(f"Erreur lors de la suppression du fichier: {str(e)}")
124
 
125
- return {"success": True, "message": "Document supprimé avec succès"}
 
 
 
126
 
127
  except HTTPException as he:
128
  raise he
 
4
  import PyPDF2
5
  from io import BytesIO
6
  from datetime import datetime
7
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
8
+ from langchain.docstore.document import Document
9
 
10
  from auth import get_admin_user
11
  from database import get_db
 
34
  for page_num in range(len(pdf_reader.pages)):
35
  text_content += pdf_reader.pages[page_num].extract_text() + "\n"
36
 
 
 
 
 
 
 
 
 
 
37
  doc_id = ObjectId()
38
 
39
  pdf_path = f"/tmp/{str(doc_id)}.pdf"
40
+ os.makedirs("files", exist_ok=True)
41
  with open(pdf_path, "wb") as f:
42
  pdf_file.seek(0)
43
  f.write(contents)
44
 
45
+ print(f"Découpage du document '{title or file.filename}' en chunks...")
46
+ splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
47
+
48
+ doc = Document(page_content=text_content, metadata={"title": title or file.filename})
49
+ chunks = splitter.split_documents([doc])
50
+ print(f"{len(chunks)} morceaux extraits.")
51
+
52
+ main_document = {
53
  "_id": doc_id,
 
 
54
  "title": title or file.filename,
55
  "tags": tags.split(",") if tags else [],
56
  "uploaded_by": str(current_user["_id"]),
57
+ "upload_date": datetime.utcnow(),
58
+ "is_parent": True,
59
+ "chunk_count": len(chunks),
60
+ "file_path": pdf_path
61
  }
62
 
63
+ db.connaissances.insert_one(main_document)
 
 
64
 
65
+ inserted_chunks = 0
66
+ chunk_ids = []
67
+
68
+ for i, chunk in enumerate(chunks):
69
+ try:
70
+ chunk_text = chunk.page_content
71
+ if len(chunk_text) > 5000:
72
+ chunk_text = chunk_text[:5000]
73
+
74
+ embedding = None
75
+ if embedding_model:
76
+ try:
77
+ embedding = embedding_model.embed_query(chunk_text)
78
+ except Exception as e:
79
+ print(f"Erreur lors de la génération de l'embedding pour le morceau {i+1}: {str(e)}")
80
+
81
+ chunk_id = ObjectId()
82
+ chunk_doc = {
83
+ "_id": chunk_id,
84
+ "parent_id": doc_id,
85
+ "text": chunk_text,
86
+ "embedding": embedding,
87
+ "title": f"{title or file.filename} - Partie {i+1}",
88
+ "tags": tags.split(",") if tags else [],
89
+ "chunk_index": i,
90
+ "uploaded_by": str(current_user["_id"]),
91
+ "upload_date": datetime.utcnow(),
92
+ "is_chunk": True
93
+ }
94
+
95
+ db.connaissances.insert_one(chunk_doc)
96
+ chunk_ids.append(str(chunk_id))
97
+ inserted_chunks += 1
98
+
99
+ print(f"Morceau {i+1}/{len(chunks)} inséré.")
100
+ except Exception as chunk_error:
101
+ print(f"Erreur lors du traitement du morceau {i+1}: {str(chunk_error)}")
102
+
103
+ db.connaissances.update_one(
104
+ {"_id": doc_id},
105
+ {"$set": {"chunk_ids": chunk_ids, "inserted_chunks": inserted_chunks}}
106
+ )
107
+
108
+ # Vérification
109
  verification = db.connaissances.find_one({"_id": doc_id})
110
  if verification:
111
+ print(f"Document parent vérifié et trouvé dans la base de données avec {inserted_chunks} chunks")
112
+ return {
113
+ "success": True,
114
+ "document_id": str(doc_id),
115
+ "chunks_total": len(chunks),
116
+ "chunks_inserted": inserted_chunks
117
+ }
118
  else:
119
+ print(f"ERREUR: Document parent non trouvé après insertion")
120
+ return {
121
+ "success": False,
122
+ "error": "Document parent non trouvé après insertion"
123
+ }
124
 
125
  except Exception as e:
126
  import traceback
 
160
  if not document:
161
  raise HTTPException(status_code=404, detail="Document non trouvé")
162
 
163
+ chunks_deleted = 0
164
+ if document.get("is_parent", False):
165
+ # Supprimer tous les chunks liés à ce parent
166
+ chunks_result = db.connaissances.delete_many({"parent_id": doc_id})
167
+ chunks_deleted = chunks_result.deleted_count
168
+ print(f"Suppression de {chunks_deleted} chunks associés au document {document_id}")
169
+
170
  result = db.connaissances.delete_one({"_id": doc_id})
171
 
172
  if result.deleted_count == 0:
 
180
  except Exception as e:
181
  print(f"Erreur lors de la suppression du fichier: {str(e)}")
182
 
183
+ return {
184
+ "success": True,
185
+ "message": f"Document supprimé avec succès, ainsi que {chunks_deleted} chunks associés"
186
+ }
187
 
188
  except HTTPException as he:
189
  raise he