|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
import numpy as np |
|
|
from sentence_transformers import SentenceTransformer |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Redirect every Hugging Face cache location to a writable temp directory.
# Useful in sandboxed / serverless environments where $HOME is read-only.
CACHE_DIR = "/tmp/hf_cache"

os.makedirs(CACHE_DIR, exist_ok=True)

# NOTE(review): some versions of transformers/huggingface_hub read these
# variables at import time, and sentence_transformers is imported above this
# point — the explicit cache_folder= passed to SentenceTransformer below is
# what actually guarantees the cache location. Confirm before relying on the
# env vars alone.
os.environ["HF_HOME"] = CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
os.environ["HF_MODULES_CACHE"] = CACHE_DIR

# Fixed: the original literal was mojibake-corrupted and split across lines,
# which is a syntax error. Restored the intended single-line message.
print(f"✅ Using Hugging Face cache at {CACHE_DIR}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the primary embedding model; on any failure (offline, gated repo,
# corrupted cache) fall back to a smaller, widely mirrored model so the
# module remains usable. Broad `except Exception` is intentional here:
# this is a best-effort fallback at import time, not error suppression.
try:
    _model = SentenceTransformer(
        "intfloat/e5-small-v2",
        cache_folder=CACHE_DIR,
    )
    # Fixed: original print literal was mojibake-corrupted and split across
    # lines (syntax error). Restored the intended single-line message.
    print("✅ Loaded model: intfloat/e5-small-v2")
except Exception as e:
    print(f"⚠️ Model load failed ({e}), falling back to MiniLM.")
    _model = SentenceTransformer(
        "sentence-transformers/all-MiniLM-L6-v2",
        cache_folder=CACHE_DIR,
    )
    print("✅ Loaded fallback model: all-MiniLM-L6-v2")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_embeddings(chunks: list) -> list:
    """
    Generate normalized embeddings for a list of text chunks.

    Args:
        chunks (list): List of text chunks (strings).

    Returns:
        list: One normalized embedding vector (a list of floats) per chunk;
            an empty list when `chunks` is empty or falsy.

    Notes:
        - Prefixing chunks with 'passage:' improves retrieval accuracy for E5.
        - normalize_embeddings=True ensures cosine-similarity consistency.
    """
    # Guard clause: nothing to embed.
    if not chunks:
        print("⚠️ No chunks provided for embedding generation.")
        return []

    # E5-family models expect a "passage:" / "query:" prefix on their inputs;
    # strip stray surrounding whitespace from each chunk first.
    prepared_chunks = [f"passage: {chunk.strip()}" for chunk in chunks]

    # Unit-length vectors so a plain dot product equals cosine similarity
    # downstream (e.g. in a vector store).
    vectors = _model.encode(
        prepared_chunks,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )

    # Convert the numpy matrix to plain Python lists for JSON-friendly storage.
    embeddings = vectors.tolist()

    # Fixed: original print literal was mojibake-corrupted and split across
    # lines (syntax error). Restored the intended single-line message.
    print(f"✅ Generated {len(embeddings)} embeddings.")
    return embeddings
|
|
|