Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, UploadFile, File, Form, HTTPException | |
| from fastapi.responses import JSONResponse | |
| from pydantic import BaseModel | |
| from typing import Optional, List | |
| from PIL import Image | |
| import io | |
| import numpy as np | |
| from embedding_service import JinaClipEmbeddingService | |
| from qdrant_service import QdrantVectorService | |
# Create the FastAPI application object that the rest of this module refers to as `app`.
app = FastAPI(
    title="Event Social Media Embeddings API",
    description=(
        "API để embeddings và search text + images từ events & social media "
        "với Jina CLIP v2 + Qdrant"
    ),
    version="1.0.0",
)
# Build the module-level service singletons at import time.
print("Initializing services...")

# Embedding model wrapper; constructed first because the vector store needs
# its embedding dimension.
embedding_service = JinaClipEmbeddingService(model_path="jinaai/jina-clip-v2")

# Vector store wrapper. Per the original note, the URL and API key are taken
# from environment variables rather than passed here.
qdrant_service = QdrantVectorService(
    collection_name="event_social_media",
    vector_size=embedding_service.get_embedding_dimension(),
)

print("✓ Services initialized successfully")
# Pydantic models
class SearchRequest(BaseModel):
    """Search parameters: query text, result limit, score cutoff and weights."""

    # Query text; may be omitted.
    text: Optional[str] = None
    # Maximum number of results.
    limit: int = 10
    # Minimum score a result must reach; None means no cutoff is supplied.
    score_threshold: Optional[float] = None
    # Relative weights of the text and image parts of a query.
    text_weight: float = 0.5
    image_weight: float = 0.5
class SearchResponse(BaseModel):
    """One search hit: document id, confidence score and stored metadata."""

    # Identifier of the matched document.
    id: str
    # Score reported for this hit.
    confidence: float
    # Payload stored alongside the document.
    metadata: dict
class IndexResponse(BaseModel):
    """Result of an indexing request."""

    # Whether indexing succeeded.
    success: bool
    # Identifier of the indexed document.
    id: str
    # Human-readable status message.
    message: str
async def root():
    """Health check: return a static description of the running service."""
    health = {
        "status": "running",
        "service": "Event Social Media Embeddings API",
        "embedding_model": "Jina CLIP v2",
        "vector_db": "Qdrant",
        "language_support": "Vietnamese + 88 other languages",
    }
    return health
async def index_data(
    id: str = Form(...),
    text: str = Form(...),
    image: Optional[UploadFile] = File(None)
):
    """
    Index a document (text and/or image) into the vector database.

    Body:
    - id: Document ID (event ID, post ID, etc.)
    - text: Text content (Vietnamese supported)
    - image: Image file (optional)

    Returns:
    - success: True/False
    - id: Document ID
    - message: Status message

    Raises:
    - HTTPException 400: neither usable text nor an image was supplied
    - HTTPException 500: any other failure while encoding or indexing
    """
    try:
        text_embedding = None
        image_embedding = None

        # Encode the text only when it contains non-whitespace characters.
        if text and text.strip():
            text_embedding = embedding_service.encode_text(text)

        # Encode the image when one was uploaded.
        if image:
            image_bytes = await image.read()
            pil_image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
            image_embedding = embedding_service.encode_image(pil_image)

        # Combine: average both modalities when present, else use the one available.
        if text_embedding is not None and image_embedding is not None:
            combined_embedding = np.mean([text_embedding, image_embedding], axis=0)
        elif text_embedding is not None:
            combined_embedding = text_embedding
        elif image_embedding is not None:
            combined_embedding = image_embedding
        else:
            raise HTTPException(status_code=400, detail="Phải cung cấp ít nhất text hoặc image")

        # L2-normalize along axis 1.
        # NOTE(review): assumes the embedding services return 2-D arrays
        # (e.g. shape (1, dim)) with non-zero norm — confirm.
        combined_embedding = combined_embedding / np.linalg.norm(combined_embedding, axis=1, keepdims=True)

        # Payload stored next to the vector.
        metadata = {
            "text": text,
            "has_image": image is not None,
            "image_filename": image.filename if image else None
        }
        result = qdrant_service.index_data(
            doc_id=id,
            embedding=combined_embedding,
            metadata=metadata
        )

        return IndexResponse(
            success=True,
            id=result["original_id"],  # per the original comment: the MongoDB ObjectId
            message=f"Đã index thành công document {result['original_id']} (Qdrant UUID: {result['qdrant_id']})"
        )
    except HTTPException:
        # Let deliberate HTTP errors (the 400 above) through unchanged instead
        # of having the generic handler below rewrap them as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Lỗi khi index: {str(e)}") from e
async def search(
    text: Optional[str] = Form(None),
    image: Optional[UploadFile] = File(None),
    limit: int = Form(10),
    score_threshold: Optional[float] = Form(None),
    text_weight: float = Form(0.5),
    image_weight: float = Form(0.5)
):
    """
    Search for similar documents using text and/or an image.

    Body:
    - text: Query text (Vietnamese supported)
    - image: Query image (optional)
    - limit: Number of results (default: 10)
    - score_threshold: Minimum confidence score (0-1)
    - text_weight: Weight for the text query (default: 0.5)
    - image_weight: Weight for the image query (default: 0.5)

    Returns:
    - List of results with id, confidence, and metadata

    Raises:
    - HTTPException 400: neither usable text nor an image was supplied
    - HTTPException 500: any other failure while encoding or searching
    """
    try:
        text_embedding = None
        image_embedding = None

        # Encode the text query only when it contains non-whitespace characters.
        if text and text.strip():
            text_embedding = embedding_service.encode_text(text)

        # Encode the image query when one was uploaded.
        if image:
            image_bytes = await image.read()
            pil_image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
            image_embedding = embedding_service.encode_image(pil_image)

        # At least one modality is required.
        if text_embedding is None and image_embedding is None:
            raise HTTPException(status_code=400, detail="Phải cung cấp ít nhất text hoặc image để search")

        results = qdrant_service.hybrid_search(
            text_embedding=text_embedding,
            image_embedding=image_embedding,
            text_weight=text_weight,
            image_weight=image_weight,
            limit=limit,
            score_threshold=score_threshold,
            ef=256  # described in the original as a high-accuracy search setting
        )

        return [
            SearchResponse(
                id=result["id"],
                confidence=result["confidence"],
                metadata=result["metadata"]
            )
            for result in results
        ]
    except HTTPException:
        # Let deliberate HTTP errors (the 400 above) through unchanged instead
        # of having the generic handler below rewrap them as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Lỗi khi search: {str(e)}") from e
async def search_by_text(
    text: str = Form(...),
    limit: int = Form(10),
    score_threshold: Optional[float] = Form(None)
):
    """
    Search using a text query only.

    Body:
    - text: Query text (Vietnamese)
    - limit: Number of results
    - score_threshold: Minimum confidence score

    Returns:
    - List of results

    Raises:
    - HTTPException 500: any failure while encoding or searching
    """
    try:
        query_vector = embedding_service.encode_text(text)
        hits = qdrant_service.search(
            query_embedding=query_vector,
            limit=limit,
            score_threshold=score_threshold,
            ef=256,
        )

        # Convert each raw hit into the response model.
        responses = []
        for hit in hits:
            responses.append(
                SearchResponse(
                    id=hit["id"],
                    confidence=hit["confidence"],
                    metadata=hit["metadata"],
                )
            )
        return responses
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Lỗi khi search: {str(e)}")
async def search_by_image(
    image: UploadFile = File(...),
    limit: int = Form(10),
    score_threshold: Optional[float] = Form(None)
):
    """
    Search using an image query only.

    Body:
    - image: Query image
    - limit: Number of results
    - score_threshold: Minimum confidence score

    Returns:
    - List of results

    Raises:
    - HTTPException 500: any failure while decoding, encoding or searching
    """
    try:
        # Read the upload and decode it as an RGB image.
        raw = await image.read()
        rgb_image = Image.open(io.BytesIO(raw)).convert('RGB')
        query_vector = embedding_service.encode_image(rgb_image)

        hits = qdrant_service.search(
            query_embedding=query_vector,
            limit=limit,
            score_threshold=score_threshold,
            ef=256,
        )

        # Convert each raw hit into the response model.
        responses = []
        for hit in hits:
            responses.append(
                SearchResponse(
                    id=hit["id"],
                    confidence=hit["confidence"],
                    metadata=hit["metadata"],
                )
            )
        return responses
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Lỗi khi search: {str(e)}")
async def delete_document(doc_id: str):
    """
    Delete a document by ID (MongoDB ObjectId or UUID, per the original note).

    Args:
    - doc_id: Document ID to delete

    Returns:
    - Success message

    Raises:
    - HTTPException 500: the delete call failed
    """
    try:
        qdrant_service.delete_by_id(doc_id)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Lỗi khi xóa: {str(e)}")
    else:
        return {"success": True, "message": f"Đã xóa document {doc_id}"}
async def get_document(doc_id: str):
    """
    Fetch a document by ID (MongoDB ObjectId or UUID, per the original note).

    Args:
    - doc_id: Document ID

    Returns:
    - Document data

    Raises:
    - HTTPException 404: the lookup returned nothing
    - HTTPException 500: the lookup itself failed
    """
    try:
        found = qdrant_service.get_by_id(doc_id)
        # Guard clause: a falsy lookup result means "not found".
        if not found:
            raise HTTPException(status_code=404, detail=f"Không tìm thấy document {doc_id}")
        return {"success": True, "data": found}
    except HTTPException:
        # Keep the 404 above from being rewrapped as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Lỗi khi get document: {str(e)}")
async def get_stats():
    """
    Return collection statistics as reported by the vector service.

    Returns:
    - Collection statistics

    Raises:
    - HTTPException 500: the statistics call failed
    """
    try:
        return qdrant_service.get_collection_info()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Lỗi khi lấy stats: {str(e)}")
if __name__ == "__main__":
    # Run the app with uvicorn when this file is executed as a script.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8000, "log_level": "info"}
    uvicorn.run(app, **server_options)