Spaces:
Running
Running
Delete pinecone_utils.py
Browse files- pinecone_utils.py +0 -59
pinecone_utils.py
DELETED
|
@@ -1,59 +0,0 @@
|
|
| 1 |
-
from sentence_transformers import SentenceTransformer
|
| 2 |
-
from config import index
|
| 3 |
-
import zlib
|
| 4 |
-
import base64
|
| 5 |
-
|
| 6 |
-
def split_text_into_chunks(text, max_chunk_size=1024):
    """Split *text* into consecutive pieces of at most *max_chunk_size* characters."""
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + max_chunk_size])
        start += max_chunk_size
    return pieces
|
| 9 |
-
|
| 10 |
-
def decompress_text(compressed_text):
    """Reverse compress_text: base64-decode *compressed_text*, then zlib-inflate to a UTF-8 string."""
    raw = zlib.decompress(base64.b64decode(compressed_text.encode('ascii')))
    return raw.decode('utf-8')
|
| 14 |
-
|
| 15 |
-
def compress_text(text):
    """Deflate *text* with zlib and return the result as an ASCII base64 string."""
    packed = zlib.compress(text.encode('utf-8'))
    return base64.b64encode(packed).decode('ascii')
|
| 19 |
-
|
| 20 |
-
def get_metadata_size(metadata):
    """Return the byte length of *metadata*'s str() representation, UTF-8 encoded.

    NOTE(review): this measures Python's repr of the dict, which only
    approximates the size Pinecone computes when serializing metadata.
    """
    encoded = str(metadata).encode('utf-8')
    return len(encoded)
|
| 23 |
-
|
| 24 |
-
def index_pdf(texts, max_chunk_size=1024, max_metadata_bytes=40960):
    """Embed each text and upsert it into Pinecone as compressed chunks.

    Each input text is encoded once with the multilingual-e5-large model,
    then split into pieces of at most ``max_chunk_size`` characters. Every
    piece is zlib-compressed, base64-encoded, and stored as vector metadata
    under the key ``"compressed_text"``. A piece whose metadata exceeds
    ``max_metadata_bytes`` is truncated to 512 characters and retried once;
    if still too large, it is skipped with a warning.

    Args:
        texts: Sequence of text strings to index.
        max_chunk_size: Maximum characters per stored chunk (default 1024,
            matching the original hard-coded value).
        max_metadata_bytes: Per-vector metadata size limit in bytes
            (default 40960, i.e. 40 KB).

    Note:
        All sub-chunks of one text share the embedding vector of the full
        text — the vector is not re-computed per sub-chunk. Vector IDs are
        ``vec_{text_index}_{chunk_index}``.
    """
    model = SentenceTransformer('intfloat/multilingual-e5-large')
    vectors = model.encode(texts)
    for i, (vector, text) in enumerate(zip(vectors, texts)):
        chunks = split_text_into_chunks(text, max_chunk_size=max_chunk_size)
        for j, small_chunk in enumerate(chunks):
            metadata = {"compressed_text": compress_text(small_chunk)}
            metadata_size = get_metadata_size(metadata)
            if metadata_size > max_metadata_bytes:
                # NOTE(review): get_metadata_size measures str(metadata), which only
                # approximates Pinecone's serialized metadata size — confirm if exact
                # enforcement against the 40 KB limit is required.
                print(f"Attention : la taille des métadonnées ({metadata_size} bytes) dépasse la limite de {max_metadata_bytes} bytes.")
                # Best-effort fallback: shrink the chunk to 512 chars and retry once.
                small_chunk = small_chunk[:512]
                metadata = {"compressed_text": compress_text(small_chunk)}
                if get_metadata_size(metadata) > max_metadata_bytes:
                    print("Impossible de réduire suffisamment la taille des métadonnées. Ignorer ce morceau.")
                    continue
            # Upsert one vector at a time; the sub-chunk index keeps IDs unique.
            index.upsert([(f"vec_{i}_{j}", vector.tolist(), metadata)])
|
| 50 |
-
|
| 51 |
-
def retrieve_documents(query):
    """Return the decompressed texts of the top-5 Pinecone matches for *query*.

    The query is embedded with the multilingual-e5-large model, Pinecone is
    queried with ``top_k=5`` and metadata included, and each match's
    ``"compressed_text"`` metadata entry is decompressed back to plain text.
    """
    model = SentenceTransformer('intfloat/multilingual-e5-large')
    query_vector = model.encode([query]).tolist()[0]
    results = index.query(vector=query_vector, top_k=5, include_metadata=True)
    return [
        decompress_text(match["metadata"]["compressed_text"])
        for match in results.get("matches", [])
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|