Spaces:
Running
Running
| """ | |
| embedder.py — LawAgent Mursit Embedder (v10-Production) | |
| ---------------------------------------------------------- | |
| Değişiklikler v9 → v10: | |
| - encode_single() metodu eklendi (retrieval.py monkey-patch kaldırıldı) | |
| - Quantize path mantığı sadeleştirildi ve hata toleranslı hale getirildi | |
| - __init__ süresi print'i düzeltildi | |
| - Qdrant local/prod ayrımı QDRANT_URL env var'ı ile yapılıyor | |
| """ | |
| import argparse | |
| import json | |
| import os | |
| import sys | |
| import time | |
| import uuid | |
| from pathlib import Path | |
# Put the BACKEND/ root (the parent of this file's directory) on sys.path so
# that the `services` package resolves no matter where the script is run from.
_BACKEND_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _BACKEND_DIR not in sys.path:
    sys.path.insert(0, _BACKEND_DIR)

# Load the first .env that exists (QDRANT_URL, GROQ_API_KEY, ...):
# BACKEND/.env is tried first, then the .env one directory above it.
from dotenv import load_dotenv

_backend_root = Path(_BACKEND_DIR)
for _env_path in (_backend_root / ".env", _backend_root.parent / ".env"):
    if not _env_path.exists():
        continue
    load_dotenv(dotenv_path=_env_path)
    print(f"[Embedder] .env yüklendi: {_env_path}")
    break
| import torch | |
| from sentence_transformers import SentenceTransformer | |
| from qdrant_client import QdrantClient | |
| from qdrant_client.http import models as qmodels | |
| from services.qdrant_client import get_qdrant_client | |
# ─── PATHS & SETTINGS ─────────────────────────────────────────────────────────
# All data files are resolved relative to this file, not the working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
CHUNK_CORPUS = os.path.join(DATA_DIR, "chunk_corpus.json")  # JSON corpus read by embed_corpus()
QUANTIZE_PATH = os.path.join(DATA_DIR, "mursit_int8.pt")  # cached int8 state_dict

MODEL_NAME = "newmindai/Mursit-Base-TR-Retrieval"  # model id handed to SentenceTransformer
COLLECTION_NAME = "lawagent_mursit"  # Qdrant collection written and read by this module
BATCH_SIZE = 32  # used both as the encode batch size and the upsert batch size
DISTANCE_METRIC = qmodels.Distance.COSINE
| # ─── EMBEDDER ───────────────────────────────────────────────────────────────── | |
class MursitEmbedder:
    """Embedding wrapper around the Mursit-Base-TR-Retrieval model.

    The figures below are carried over from the original author's notes:

    quantize=False → float32 (~594 MB RAM, ~145 ms/query)
    quantize=True  → int8    (~173 MB RAM, ~94 ms/query, ~3% MRR loss)
    """

    def __init__(self, quantize: bool = False):
        """Load the model on CPU and, if requested, swap in int8 weights.

        Args:
            quantize: When True, the transformer's Linear layers are replaced
                by dynamically quantized int8 versions.
        """
        self.quantize = quantize
        self.device = "cpu"
        fmt = "int8" if quantize else "float32"
        print(f"[Mursit] Model yükleniyor ({fmt})...")
        t0 = time.time()
        self.st = SentenceTransformer(MODEL_NAME, device=self.device)
        self.vector_size = self._embedding_dimension()
        if quantize:
            self._load_or_quantize()
        print(f"[Mursit] Hazır — {time.time()-t0:.1f}s | dim={self.vector_size}")

    # ------------------------------------------------------------------
    def _embedding_dimension(self) -> int:
        """Return the model's output vector size.

        Prefers ``get_embedding_dimension`` and falls back to
        ``get_sentence_embedding_dimension`` when the installed
        sentence-transformers release does not provide the former.
        """
        getter = getattr(self.st, "get_embedding_dimension", None)
        if getter is None:
            getter = self.st.get_sentence_embedding_dimension
        return getter()

    @staticmethod
    def _quantize(module):
        """Return a dynamically quantized (int8 Linear layers) version of *module*."""
        return torch.quantization.quantize_dynamic(
            module, {torch.nn.Linear}, dtype=torch.qint8
        )

    def _load_or_quantize(self) -> None:
        """Install an int8 transformer, reusing the cached state when possible.

        The model is always quantized first so that the module has the
        quantized layer structure the cached state_dict expects. If no cache
        exists, the fresh state_dict is written to QUANTIZE_PATH.
        """
        first_module = self.st._first_module()
        float_model = first_module.auto_model
        quantized = self._quantize(float_model)
        if os.path.exists(QUANTIZE_PATH):
            print("[Mursit] Kaydedilmiş int8 ağırlıklar yükleniyor...")
            try:
                # NOTE(security): torch.load may unpickle arbitrary objects
                # (depending on the installed torch's `weights_only` default).
                # Only point QUANTIZE_PATH at files this program wrote itself.
                state = torch.load(QUANTIZE_PATH, map_location="cpu")
                quantized.load_state_dict(state)
            except Exception as e:
                print(f"[UYARI] Kaydedilmiş int8 yüklenemedi, sıfırdan quantize edildi: {e}")
                # load_state_dict can copy part of a mismatching checkpoint
                # before it raises, so rebuild from the untouched float model
                # instead of keeping a possibly half-loaded module.
                quantized = self._quantize(float_model)
        else:
            print("[Mursit] Quantize ediliyor (ilk kez)...")
            os.makedirs(os.path.dirname(QUANTIZE_PATH) or ".", exist_ok=True)
            torch.save(quantized.state_dict(), QUANTIZE_PATH)
            print(f"[Mursit] int8 kaydedildi → {QUANTIZE_PATH}")
        first_module.auto_model = quantized

    # ------------------------------------------------------------------
    def encode(self, texts: list[str], normalize: bool = True) -> list:
        """Encode a list of texts and return the vectors as nested Python lists.

        The texts are passed to the model unchanged (no prefix is added).

        Args:
            texts: Texts to embed.
            normalize: Forwarded as ``normalize_embeddings``.
        """
        return self.st.encode(
            texts,
            normalize_embeddings=normalize,
            show_progress_bar=False,
            batch_size=BATCH_SIZE,
        ).tolist()

    def encode_single(self, text: str, normalize: bool = True) -> list[float]:
        """Encode one query string, prepending the ``"query: "`` prefix.

        The original author describes this prefix as the Mursit query format.

        Args:
            text: Query text; surrounding whitespace is stripped first.
            normalize: Forwarded as ``normalize_embeddings``.
        """
        prefix = "query: "
        full_text = prefix + text.strip()
        return self.st.encode(
            full_text,
            normalize_embeddings=normalize,
            show_progress_bar=False,
            convert_to_numpy=True,
        ).tolist()

    def kaydet(self, yol: str = QUANTIZE_PATH) -> None:
        """Save the quantized transformer's state_dict to *yol*.

        Does nothing except print a warning when the instance was created
        with ``quantize=False``.
        """
        if not self.quantize:
            print("[UYARI] float32 model kaydedilmiyor. --quantize ile çalıştır.")
            return
        os.makedirs(os.path.dirname(yol) or ".", exist_ok=True)
        torch.save(self.st._first_module().auto_model.state_dict(), yol)
        mb = os.path.getsize(yol) / 1024 / 1024
        print(f"[Mursit] int8 kaydedildi → {yol} ({mb:.1f} MB)")
| # ─── QDRANT HELPERS ─────────────────────────────────────────────────────────── | |
def _chunk_id_to_uint64(cid: str) -> int:
    """Map a chunk id to a stable unsigned 64-bit integer point id.

    The id is the upper 64 bits of the UUIDv5 (DNS namespace) of ``str(cid)``,
    so the same chunk id always yields the same integer.
    """
    digest = uuid.uuid5(uuid.NAMESPACE_DNS, str(cid))
    # The first 8 bytes, read big-endian, are exactly the top half of the
    # 128-bit UUID integer.
    return int.from_bytes(digest.bytes[:8], byteorder="big")
def _get_existing_ids(client: QdrantClient, corpus: list) -> set:
    """Return the point ids from *corpus* that are already in the collection.

    Ids are derived from each chunk's ``"chunk_id"`` and looked up in pages
    of 1000. The lookup is best-effort: a failed page is reported and
    skipped, so its chunks are treated as missing by the caller.

    Args:
        client: Qdrant client used for the ``retrieve`` calls.
        corpus: Chunk dicts; each must contain a ``"chunk_id"`` key.

    Returns:
        Set of point ids reported by Qdrant.
    """
    page_size = 1000
    ids = [_chunk_id_to_uint64(c["chunk_id"]) for c in corpus]
    existing = set()
    for start in range(0, len(ids), page_size):
        try:
            points = client.retrieve(
                collection_name=COLLECTION_NAME,
                ids=ids[start : start + page_size],
                with_payload=False,
                with_vectors=False,
            )
        except Exception as exc:
            # Keep going, but make the failure visible instead of silently
            # pretending none of these points exist.
            print(f"[UYARI] Mevcut ID sorgusu başarısız (offset={start}): {exc}")
        else:
            existing.update(p.id for p in points)
    return existing
def _ensure_collection(
    client: QdrantClient, vector_size: int, reset: bool
) -> None:
    """Make sure COLLECTION_NAME exists, optionally dropping it first.

    Args:
        client: Qdrant client to operate on.
        vector_size: Vector dimension used when the collection is created.
        reset: When True and the collection exists, delete and recreate it.
    """
    names = [col.name for col in client.get_collections().collections]
    present = COLLECTION_NAME in names

    if reset and present:
        client.delete_collection(COLLECTION_NAME)
        print(f"[Qdrant] Collection silindi: {COLLECTION_NAME}")
        present = False

    if present:
        # Nothing to create; just report how many points it currently holds.
        count = client.count(COLLECTION_NAME).count
        print(f"[Qdrant] Mevcut: {COLLECTION_NAME} ({count} kayıt)")
        return

    params = qmodels.VectorParams(size=vector_size, distance=DISTANCE_METRIC)
    client.create_collection(COLLECTION_NAME, vectors_config=params)
    print(f"[Qdrant] Collection oluşturuldu: {COLLECTION_NAME}")
| # ─── EMBED CORPUS ───────────────────────────────────────────────────────────── | |
def _enrich_text(chunk: dict) -> str:
    """Return the text that is embedded for *chunk*: a citation header plus the body."""
    text = chunk.get("text", "")
    if chunk.get("source", "") == "yargitay":
        return f"Yargıtay Kararı {chunk.get('decision_id', '')}: {text}"
    # Every other source is formatted as legislation: law + article + text.
    return f"{chunk.get('law', '')} Madde {chunk.get('article_no', '')}: {text}"


def _build_point(chunk: dict, vector: list) -> "qmodels.PointStruct":
    """Build the Qdrant point (id, vector, payload) stored for *chunk*."""
    return qmodels.PointStruct(
        id=_chunk_id_to_uint64(chunk["chunk_id"]),
        vector=vector,
        payload={
            "chunk_id": chunk.get("chunk_id", ""),
            "text": chunk.get("text", ""),
            "law": chunk.get("law", ""),
            "article_no": chunk.get("article_no", ""),
            "source": chunk.get("source", ""),
            "decision_id": chunk.get("decision_id", ""),
            "token_len": chunk.get("token_len", 0),
            # Citations: the bridge between legislation and case law.
            "atiflar": chunk.get("atiflar", []),
        },
    )


def embed_corpus(
    reset: bool = False,
    test_mode: bool = False,
    quantize: bool = False,
) -> None:
    """Embed the chunk corpus and upsert the missing chunks into Qdrant.

    Reads CHUNK_CORPUS, ensures the collection exists, skips chunks whose
    point id is already stored (unless *reset*), and upserts the rest in
    batches of BATCH_SIZE while printing progress.

    Args:
        reset: Drop and recreate the collection, then embed every chunk.
        test_mode: Only process the first 20 chunks.
        quantize: Use the int8-quantized model for encoding.
    """
    if not os.path.exists(CHUNK_CORPUS):
        print(f"[HATA] {CHUNK_CORPUS} bulunamadı. Önce legal_chunker.py çalıştır.")
        return
    with open(CHUNK_CORPUS, "r", encoding="utf-8") as f:
        corpus = json.load(f)
    if test_mode:
        corpus = corpus[:20]
        print("[Test] Sadece ilk 20 chunk işlenecek.")

    embedder = MursitEmbedder(quantize=quantize)
    client = get_qdrant_client()
    _ensure_collection(client, embedder.vector_size, reset)

    # After a reset the collection is empty, so the existence lookup is skipped.
    existing = set() if reset else _get_existing_ids(client, corpus)
    yeni = [c for c in corpus if _chunk_id_to_uint64(c["chunk_id"]) not in existing]
    print(
        f"[Embed] Toplam={len(corpus)} | Mevcut={len(existing)} | Eklenecek={len(yeni)}"
    )
    if not yeni:
        print("[Embed] Her şey güncel, işlem yok.")
        return

    t0, eklenen, toplam = time.time(), 0, len(yeni)
    for start in range(0, toplam, BATCH_SIZE):
        batch = yeni[start : start + BATCH_SIZE]
        vecs = embedder.encode([_enrich_text(c) for c in batch])
        points = [_build_point(c, vec) for c, vec in zip(batch, vecs)]
        client.upsert(collection_name=COLLECTION_NAME, points=points)

        eklenen += len(batch)
        elapsed = time.time() - t0
        # Estimate remaining time from the average time per chunk so far.
        remaining = (elapsed / eklenen) * (toplam - eklenen) if eklenen else 0
        print(
            f" {eklenen:4d}/{toplam} (%{eklenen/toplam*100:.0f})"
            f" | ~{remaining/60:.1f} dk kaldı"
        )
    print(f"\n[Embed] Tamamlandı. {eklenen} chunk eklendi. ({time.time()-t0:.1f}s)")
| # ─── CLI ────────────────────────────────────────────────────────────────────── | |
def main(argv=None) -> None:
    """Command-line entry point.

    ``--quantize --kaydet`` builds the int8 model and saves it; every other
    valid combination runs ``embed_corpus``. ``--kaydet`` without
    ``--quantize`` is rejected with a usage error, because there is no
    quantized model to save and silently starting a corpus embed would be
    surprising.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="LawAgent Mursit Embedder v10")
    parser.add_argument("--reset", action="store_true", help="Collection'ı sıfırla")
    parser.add_argument("--test", action="store_true", help="İlk 20 chunk")
    parser.add_argument("--quantize", action="store_true", help="int8 quantization")
    parser.add_argument("--kaydet", action="store_true", help="Quantized modeli kaydet")
    args = parser.parse_args(argv)

    if args.kaydet and not args.quantize:
        # parser.error prints the usage text and exits with status 2.
        parser.error("--kaydet yalnızca --quantize ile birlikte kullanılabilir.")

    if args.kaydet:
        MursitEmbedder(quantize=True).kaydet()
        return
    embed_corpus(reset=args.reset, test_mode=args.test, quantize=args.quantize)


if __name__ == "__main__":
    main()