Sathvik-kota's picture
Upload folder using huggingface_hub
1f9de36 verified
# src/embed_service/app.py
from fastapi import FastAPI
from pydantic import BaseModel
from src.embed_service.embedder import Embedder
from src.embed_service.cache_manager import CacheManager
import numpy as np
# FastAPI application object; the route decorators in this file register
# their handlers on it.
app = FastAPI(title="Embed Service")
# Module-level instances created once at import time and shared by the
# request handlers in this file.
embedder = Embedder()
cache = CacheManager()
class EmbedRequest(BaseModel):
    """Request body accepted by ``POST /embed_document``."""

    # Document name; passed to the cache for lookup and storage.
    filename: str
    # Text handed to the embedder when the cache has no matching entry.
    text: str
    # Caller-supplied hash; passed to the cache together with ``filename``.
    hash: str
@app.post("/embed_document")
def embed_document(req: EmbedRequest):
    """Return the embedding for a single document, consulting the cache first.

    The cache is queried with the request's filename and hash. When it has
    no matching entry, the text is embedded and the new embedding is stored
    in the cache before the response is built. Otherwise the stored
    embedding is returned as-is.

    Args:
        req: Filename, text and hash of the document to embed.

    Returns:
        A dict with ``filename``, a ``cached`` flag telling whether the
        embedding came from the cache, and ``embedding`` (the result of
        calling ``.tolist()`` on the embedding).
    """
    # Miss path first: compute, remember, respond.
    if not cache.exists(req.filename, req.hash):
        fresh = embedder.embed_text(req.text)
        cache.add_embedding(req.filename, req.hash, fresh)
        return {
            "filename": req.filename,
            "cached": False,
            "embedding": fresh.tolist(),
        }

    # Hit path: reuse what the cache already holds for this filename.
    stored = cache.get_embedding(req.filename)
    return {
        "filename": req.filename,
        "cached": True,
        "embedding": stored.tolist(),
    }
class BatchEmbedRequest(BaseModel):
    """Request body accepted by ``POST /embed_batch``."""

    # Documents to embed. The batch handler reads "filename", "hash" and
    # "text" (or "clean_text") from each entry via ``.get``.
    docs: list
@app.post("/embed_batch")
def embed_batch(req: BatchEmbedRequest):
    """Embed a batch of documents, reusing cached embeddings where available.

    Each entry of ``req.docs`` is read with ``.get`` for ``filename``,
    ``hash`` and ``text`` (falling back to ``clean_text``, then to an empty
    string). Entries the cache already holds are answered from the cache;
    the remaining ones are embedded together in a single
    ``embedder.embed_batch`` call and then stored in the cache.

    Results are returned in the same order as ``req.docs`` so that callers
    can align them positionally with their input, regardless of which
    entries were cache hits.

    Args:
        req: Batch request whose ``docs`` is a list of dict-like entries.

    Returns:
        ``{"count": <number of results>, "results": [...]}`` where each
        result has ``filename``, ``cached`` and ``embedding`` (the result of
        calling ``.tolist()`` on the embedding).
    """
    # One slot per input document so output order follows input order.
    slots = [None] * len(req.docs)
    pending = []        # (position, filename, hash) for every cache miss
    pending_texts = []  # texts of the misses, in the same order as `pending`

    for position, doc in enumerate(req.docs):
        filename = doc.get("filename")
        file_hash = doc.get("hash")
        text = doc.get("text") or doc.get("clean_text") or ""
        if cache.exists(filename, file_hash):
            slots[position] = {
                "filename": filename,
                "cached": True,
                "embedding": cache.get_embedding(filename).tolist(),
            }
        else:
            pending.append((position, filename, file_hash))
            pending_texts.append(text)

    if pending_texts:
        # Embed all misses in one call, then cache each result.
        new_embs = embedder.embed_batch(pending_texts)
        for (position, filename, file_hash), emb in zip(pending, new_embs):
            cache.add_embedding(filename, file_hash, emb)
            slots[position] = {
                "filename": filename,
                "cached": False,
                "embedding": emb.tolist(),
            }

    # zip() stops early if the embedder returns fewer vectors than texts;
    # drop any slot left unfilled instead of emitting None entries.
    results = [entry for entry in slots if entry is not None]
    return {"count": len(results), "results": results}
@app.get("/all_embeddings")
def get_all_embeddings():
    """Return everything the cache reports from ``all_embeddings()``.

    Returns:
        A dict with ``meta`` (the first element of the cache result, passed
        through unchanged) and ``embeddings`` (the second element, converted
        with ``.tolist()``).
    """
    meta, matrix = cache.all_embeddings()
    payload = {"meta": meta}
    payload["embeddings"] = matrix.tolist()
    return payload
# convenience endpoint called earlier by older code
@app.post("/embed_all")
def embed_all_docs(docs: list):
    """Adapt a list of document dicts to the batch handler and delegate to it.

    Each input entry must provide ``filename`` and ``hash`` (accessed by
    key). The text is taken from ``clean_text`` when that is truthy,
    otherwise from ``text`` (defaulting to an empty string).

    Args:
        docs: List of dicts shaped like ``{filename, clean_text, hash}``.

    Returns:
        Whatever ``embed_batch`` returns for the rewritten entries.
    """
    normalized = []
    for entry in docs:
        normalized.append(
            {
                "filename": entry["filename"],
                "text": entry.get("clean_text") or entry.get("text", ""),
                "hash": entry["hash"],
            }
        )
    return embed_batch(BatchEmbedRequest(docs=normalized))