Spaces:
Sleeping
Sleeping
| # src/embed_service/app.py | |
| from fastapi import FastAPI | |
| from pydantic import BaseModel | |
| from src.embed_service.embedder import Embedder | |
| from src.embed_service.cache_manager import CacheManager | |
| import numpy as np | |
# Module-level singletons: one FastAPI application plus the embedder and
# cache objects that the handler functions in this module read as globals.
# NOTE(review): no route decorators are visible on the handlers in this
# view -- confirm they are registered on `app` in the real file.
app = FastAPI(title="Embed Service")

# Produces embeddings; the handlers call `embed_text` and `embed_batch` on it.
embedder = Embedder()

# Stores embeddings; the handlers look entries up by filename and hash.
cache = CacheManager()
class EmbedRequest(BaseModel):
    """Request body for embedding a single document."""

    # Name the embedding is cached under.
    filename: str
    # Content that is embedded when the cache has no matching entry.
    text: str
    # Passed to the cache together with `filename` when checking for and
    # storing an entry.
    hash: str
def embed_document(req: EmbedRequest):
    """Return the embedding for one document, reusing the cache when possible.

    The cache is asked whether an entry exists for the request's filename
    and hash. On a hit the stored embedding is returned; on a miss the text
    is embedded and the result is stored under that filename and hash.

    Args:
        req: The document's filename, text and hash.

    Returns:
        A dict with keys ``"filename"``, ``"cached"`` (``True`` when the
        embedding came from the cache, ``False`` when it was just computed)
        and ``"embedding"`` (the embedding's ``.tolist()`` output).
    """
    name = req.filename
    digest = req.hash

    if cache.exists(name, digest):
        vector = cache.get_embedding(name)
        served_from_cache = True
    else:
        vector = embedder.embed_text(req.text)
        cache.add_embedding(name, digest, vector)
        served_from_cache = False

    return {
        "filename": name,
        "cached": served_from_cache,
        "embedding": vector.tolist(),
    }
class BatchEmbedRequest(BaseModel):
    """Request body for embedding several documents in one call."""

    # Each entry is read with `.get("filename")`, `.get("hash")` and
    # `.get("text")` / `.get("clean_text")` by `embed_batch`, so entries are
    # expected to be dict-like; the element type is not validated here.
    docs: list
def embed_batch(req: BatchEmbedRequest):
    """Embed a batch of documents, serving cached entries where available.

    Each document is looked up in the cache by filename and hash. Documents
    without a cache entry are collected and embedded together in a single
    ``embedder.embed_batch`` call, then stored in the cache.

    Results are returned in the same order as ``req.docs``. (Previously all
    cache hits were listed first and all newly embedded documents after
    them, so positions did not line up with the request when hits and
    misses were mixed.)

    Args:
        req: Batch request whose ``docs`` entries are dict-like objects with
            ``"filename"``, ``"hash"`` and ``"text"`` (or ``"clean_text"``)
            keys. A document with neither text key, or with empty values, is
            embedded as the empty string.

    Returns:
        A dict with ``"count"`` (number of results) and ``"results"``, a list
        of dicts with keys ``"filename"``, ``"cached"`` and ``"embedding"``
        (the embedding's ``.tolist()`` output).
    """
    docs = req.docs

    # One slot per input document so every result lands at its request index.
    slots = [None] * len(docs)

    # Cache misses: (request index, filename, hash), parallel to `miss_texts`.
    misses = []
    miss_texts = []

    for position, doc in enumerate(docs):
        filename = doc.get("filename")
        file_hash = doc.get("hash")
        if cache.exists(filename, file_hash):
            slots[position] = {
                "filename": filename,
                "cached": True,
                "embedding": cache.get_embedding(filename).tolist(),
            }
        else:
            misses.append((position, filename, file_hash))
            miss_texts.append(doc.get("text") or doc.get("clean_text") or "")

    # Only call the embedder when there is something new to embed.
    if miss_texts:
        new_embs = embedder.embed_batch(miss_texts)
        for (position, filename, file_hash), emb in zip(misses, new_embs):
            cache.add_embedding(filename, file_hash, emb)
            slots[position] = {
                "filename": filename,
                "cached": False,
                "embedding": emb.tolist(),
            }

    # If the embedder returned fewer vectors than texts, the unmatched
    # documents are left out of the response, as they were before.
    results = [entry for entry in slots if entry is not None]
    return {"count": len(results), "results": results}
def get_all_embeddings():
    """Return everything the cache currently holds.

    Returns:
        A dict with ``"meta"`` (the first value returned by
        ``cache.all_embeddings()``, passed through unchanged) and
        ``"embeddings"`` (the second value, converted with ``.tolist()``).
    """
    stored = cache.all_embeddings()
    metadata, vectors = stored
    payload = {"meta": metadata}
    payload["embeddings"] = vectors.tolist()
    return payload
# Convenience entry point kept for older callers.
def embed_all_docs(docs: list):
    """Normalize legacy document dicts and delegate to ``embed_batch``.

    Args:
        docs: List of dicts with ``"filename"`` and ``"hash"`` keys (both
            required; a missing one raises ``KeyError``) and the document
            text under ``"clean_text"`` or, failing that, ``"text"``.

    Returns:
        Whatever ``embed_batch`` returns for the normalized batch.
    """
    normalized = []
    for entry in docs:
        # Key access order (filename, text, hash) is kept so that a missing
        # key raises the same KeyError as before.
        name = entry["filename"]
        body = entry.get("clean_text") or entry.get("text", "")
        digest = entry["hash"]
        normalized.append({"filename": name, "text": body, "hash": digest})

    request = BatchEmbedRequest(docs=normalized)
    return embed_batch(request)