# ==========================================================
# embeddings.py – optimized for Hugging Face + FAISS + E5
# ==========================================================
import os
import numpy as np
from sentence_transformers import SentenceTransformer
# ----------------------------
# Hugging Face Cache Bootstrap
# ----------------------------
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ["HF_HOME"] = CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
os.environ["HF_MODULES_CACHE"] = CACHE_DIR
print(f"β
Using Hugging Face cache at {CACHE_DIR}")
# ----------------------------
# Load Embedding Model (E5 with fallback)
# ----------------------------
try:
    _model = SentenceTransformer(
        "intfloat/e5-small-v2",  # ✅ Trained for retrieval-augmented QA
        cache_folder=CACHE_DIR,
    )
    print("✅ Loaded model: intfloat/e5-small-v2")
except Exception as e:
    print(f"⚠️ Model load failed ({e}), falling back to MiniLM.")
    _model = SentenceTransformer(
        "sentence-transformers/all-MiniLM-L6-v2",
        cache_folder=CACHE_DIR,
    )
    print("✅ Loaded fallback model: all-MiniLM-L6-v2")
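# Record the output dimension of whichever model actually loaded (384 for
# both e5-small-v2 and all-MiniLM-L6-v2). This constant is an addition, not
# part of the original file; it keeps downstream FAISS index construction
# in sync with the loaded model.
EMBEDDING_DIM = _model.get_sentence_embedding_dimension()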
# ----------------------------
# Function: Generate Embeddings
# ----------------------------
def generate_embeddings(chunks: list) -> list:
    """
    Generate normalized embeddings for a list of text chunks.
    Args:
        chunks (list): List of text chunks.
    Returns:
        list: List of normalized embedding vectors (Python lists).
    Notes:
        - Prefixing chunks with 'passage:' improves retrieval accuracy for E5.
        - normalize_embeddings=True ensures cosine-similarity consistency.
        - Scales to large documents (e.g., chunked PDFs): sentence-transformers
          batches the encode call internally.
    """
    if not chunks:
        print("⚠️ No chunks provided for embedding generation.")
        return []
    # Step 1: Prefix each chunk for semantic clarity (per E5 training)
    prepared_chunks = [f"passage: {chunk.strip()}" for chunk in chunks]
    # Step 2: Encode with normalization for cosine similarity
    vectors = _model.encode(
        prepared_chunks,
        convert_to_numpy=True,
        normalize_embeddings=True,  # ✅ Makes FAISS IndexFlatIP accurate
    )
    # Step 3: Convert to Python list for FAISS / JSON compatibility
    embeddings = vectors.tolist()
    print(f"✅ Generated {len(embeddings)} embeddings.")
    return embeddings
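# ----------------------------
# Usage Sketch (illustrative)
# ----------------------------
# A minimal, hedged demo of the intended retrieval flow: embed sample
# passages, index them with faiss.IndexFlatIP (inner product equals cosine
# similarity because the vectors are normalized), and run one query.
# The sample texts and the 'query:' prefix are assumptions drawn from the
# E5 model card, not from this file; requires `pip install faiss-cpu`.
if __name__ == "__main__":
    import faiss

    sample_chunks = [
        "FAISS enables efficient similarity search over dense vectors.",
        "E5 models expect 'query:' and 'passage:' prefixes at inference.",
    ]
    embeddings = generate_embeddings(sample_chunks)

    # Build an inner-product index sized to the model's output dimension.
    index = faiss.IndexFlatIP(EMBEDDING_DIM)
    index.add(np.asarray(embeddings, dtype="float32"))

    # Queries mirror the 'passage:' convention with a 'query:' prefix.
    query_vec = _model.encode(
        ["query: what is FAISS used for?"],
        convert_to_numpy=True,
        normalize_embeddings=True,
    ).astype("float32")
    scores, ids = index.search(query_vec, 1)
    print(f"Top match: chunk {ids[0][0]} with score {scores[0][0]:.3f}")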