""" LIFT UP Taksonomi Sınıflandırıcı — Backend API Hugging Face Spaces (Docker) üzerinde çalışır. Endpoint: POST /classify { "baslik": "Proje başlığı", "ozet": "Proje özeti", "keywords": ["opsiyonel", "liste"] # opsiyonel } → { "prediction": "Kompozit Yapılar", "confidence": 0.82, "top_3": [...], "extracted_keywords": [...], "processing_time_ms": 1240 } """ import os import re import time import unicodedata import logging from contextlib import asynccontextmanager from collections import Counter from dataclasses import dataclass, field from typing import Dict, List, Optional, Set import numpy as np import torch import torch.nn as nn from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM from sentence_transformers import SentenceTransformer from keybert import KeyBERT from huggingface_hub import hf_hub_download, snapshot_download logging.basicConfig(level=logging.INFO) log = logging.getLogger("liftup") HF_USERNAME = os.getenv("HF_USERNAME", "Engin34") HF_TOKEN = os.getenv("HF_TOKEN", "") # Space secret olarak eklenecek # ─── Global model değişkenleri ─────────────────────────────────────── bert_model = None bert_tok = None kw_model = None generator = None clf = None TOP_KEYWORDS = None device = torch.device("cpu") # ═══════════════════════════════════════════════════════════════════ # TAKSONOMİ PARSER # ═══════════════════════════════════════════════════════════════════ def _temizle(k): k = k.replace('\u200b','').replace('\ufeff','') k = unicodedata.normalize('NFKC', k) return re.sub(r'\s+',' ', k).strip().lower() def _parantez_ayir(k): m = re.match(r'^(.+?)\s*\((.+?)\)\s*$', k) if not m: return [k] ana, ic = m.group(1).strip(), m.group(2).strip() if any(a in ic.lower() for a in ['bağlam','kısmı','tarafı','proses','analiz','anahtarları','servisleme']): return [ana] return [ana] + [p.strip() for p in ic.split('/') if p.strip()] def _virgul_ayir(metin): sonuc, buf, d = [], [], 0 for c in metin: if c == '(': d += 1; buf.append(c) elif c == ')': d -= 1; buf.append(c) elif c == ',' and d == 0: sonuc.append(''.join(buf)); buf = [] else: buf.append(c) if buf: sonuc.append(''.join(buf)) return sonuc def parse_taksonomi(icerik: str) -> Dict: icerik = icerik.replace('\u200b','') matches = list(re.finditer(r'^\s*(\d+)\)\s+(.+?)\s*$', icerik, re.MULTILINE)) tax = {} for i, m in enumerate(matches): kat = m.group(2).strip() govde = icerik[m.end():(matches[i+1].start() if i+1 < len(matches) else len(icerik))].strip() pm = re.search(r'\((.+)\)', govde, re.DOTALL) if not pm: continue kw_set = set() for parca in _virgul_ayir(pm.group(1)): for alt in _parantez_ayir(parca.strip()): for k in re.split(r'[/]', alt): temiz = _temizle(k) if len(temiz) >= 2: kw_set.add(temiz) tax[kat] = {'keywords': kw_set} return tax # ═══════════════════════════════════════════════════════════════════ # HİBRİT SINIFLANDIRICI # ═══════════════════════════════════════════════════════════════════ @dataclass class EslesmeBilgisi: keyword: str; eslesme_tipi: str; eslesen_taksonomi_kw: str; puan: float @dataclass class KategoriSkoru: kategori: str; final_skor: float; keyword_skor: float; semantic_skor: float eslesmeler: list = field(default_factory=list) class HibritSiniflandirici: def __init__(self, taxonomy, embedder, keyword_weight=0.4, semantic_weight=0.6): self.taxonomy = {c: {'keywords':{str(k).lower().strip() for k in d.get('keywords',set()) if str(k).strip()}} for c,d in taxonomy.items()} 

# ═══════════════════════════════════════════════════════════════════
# HYBRID CLASSIFIER
# ═══════════════════════════════════════════════════════════════════

@dataclass
class EslesmeBilgisi:
    keyword: str
    eslesme_tipi: str
    eslesen_taksonomi_kw: str
    puan: float

@dataclass
class KategoriSkoru:
    kategori: str
    final_skor: float
    keyword_skor: float
    semantic_skor: float
    eslesmeler: list = field(default_factory=list)

class HibritSiniflandirici:
    """Combines lexical keyword matching with embedding-based similarity."""

    def __init__(self, taxonomy, embedder, keyword_weight=0.4, semantic_weight=0.6):
        self.taxonomy = {
            c: {'keywords': {str(k).lower().strip()
                             for k in d.get('keywords', set()) if str(k).strip()}}
            for c, d in taxonomy.items()
        }
        self.kw_w, self.sem_w = keyword_weight, semantic_weight
        self.embedder = embedder
        log.info("Computing centroids...")
        self.centroids = self._centroids()
        self.idf = self._idf()
        log.info(f"Ready: {len(self.taxonomy)} categories")

    def _centroids(self):
        """L2-normalized mean embedding of each category's keywords."""
        c = {}
        for cat, d in self.taxonomy.items():
            kws = list(d['keywords'])
            if not kws:
                c[cat] = None
                continue
            embs = self.embedder.encode(kws, show_progress_bar=False, convert_to_numpy=True)
            v = np.mean(embs, axis=0)
            n = np.linalg.norm(v)
            c[cat] = v / n if n > 0 else v
        return c

    def _idf(self):
        """IDF-style weights: keywords shared by many categories count less."""
        cnt = Counter()
        for d in self.taxonomy.values():
            for k in d['keywords']:
                cnt[k] += 1
        N = len(self.taxonomy)
        return {k: np.log(N / v) + 1.0 for k, v in cnt.items()}

    def _kw_score(self, extracted):
        ext = [k.lower().strip() for k in extracted if k and str(k).strip()]
        max_idf = max(self.idf.values(), default=1.0)
        results = {}
        for cat, d in self.taxonomy.items():
            cat_kws = d['keywords']
            score, eslm = 0.0, []
            for kw in ext:
                idf_w = self.idf.get(kw, 1.0)
                if kw in cat_kws:
                    p = 2.0 * idf_w
                    score += p
                    eslm.append(EslesmeBilgisi(kw, 'exact', kw, p))
                    continue
                if len(kw) < 4:
                    continue
                for ck in cat_kws:
                    if len(ck) >= 4 and (kw in ck or ck in kw):
                        p = 1.0 * idf_w
                        score += p
                        eslm.append(EslesmeBilgisi(kw, 'partial', ck, p))
                        break
            max_p = max(len(ext) * 2.0 * max_idf, 1e-6)
            results[cat] = (min(score / max_p, 1.0), eslm)
        return results

    def _sem_score(self, extracted, text=None):
        parts = []
        if text and str(text).strip():
            parts.append(str(text).strip())
        if extracted:
            parts.append(" ".join(extracted))
        if not parts:
            return {c: 0.0 for c in self.taxonomy}
        emb = self.embedder.encode([" | ".join(parts)], show_progress_bar=False,
                                   convert_to_numpy=True)[0]
        n = np.linalg.norm(emb)
        if n > 0:
            emb = emb / n
        # Cosine similarity mapped from [-1, 1] to [0, 1]
        return {c: max(0.0, min(1.0, (float(np.dot(emb, cn)) + 1.0) / 2.0))
                if cn is not None else 0.0
                for c, cn in self.centroids.items()}

    def classify(self, keywords, text=None, top_k=3):
        kw_r = self._kw_score(keywords)
        sem_s = self._sem_score(keywords, text)
        ks = {}
        for c in self.taxonomy:
            kwn, esl = kw_r[c]
            f = self.kw_w * kwn + self.sem_w * sem_s[c]
            ks[c] = KategoriSkoru(c, f, kwn, sem_s[c], esl)
        srt = sorted(ks.values(), key=lambda x: x.final_skor, reverse=True)
        return {'prediction': srt[0].kategori,
                'confidence': srt[0].final_skor,
                'top_k': srt[:top_k]}
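
# Usage sketch (hypothetical; `embedder` is any SentenceTransformer-style
# object with .encode(), e.g. the KeyBERT backend wired up in load_models):
#
#   clf = HibritSiniflandirici(parse_taksonomi(raw), embedder)
#   r = clf.classify(['rtm', 'karbon fiber'], text="Kompozit kanat üretimi")
#   r['prediction']   # e.g. 'Kompozit Yapılar'
#
# With the default weights, final_skor = 0.4 * keyword_skor + 0.6 * semantic_skor,
# so a category with keyword_skor 0.50 and semantic_skor 0.80 scores 0.68.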

# ═══════════════════════════════════════════════════════════════════
# BERT MODEL
# ═══════════════════════════════════════════════════════════════════

class LiftUpBertModel(nn.Module):
    """Multi-label keyword tagger: BERT [CLS] vector → linear layer."""

    def __init__(self, num_labels=128):
        super().__init__()
        self.bert = AutoModel.from_pretrained("dbmdz/bert-base-turkish-cased")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(out.last_hidden_state[:, 0]))

# ═══════════════════════════════════════════════════════════════════
# POST-PROCESSOR
# ═══════════════════════════════════════════════════════════════════

class SoftPostProcessor:
    """Cleans generated keywords: casing, blacklist, length and charset filters."""

    def __init__(self):
        self.blacklist = {'kombinatür', 'hesonomik', 'modülasyonları',
                          'difüzörlü', 'optimizasyonlarını'}
        self.acronyms = {'CFD', 'FEA', 'CAD', 'ROS', 'CNN', 'AI', 'ML', 'DL',
                         'IoT', 'GPU', 'SSD'}

    def is_acronym(self, w):
        return w.isupper() and 2 <= len(w) <= 5

    def fix_case(self, kw):
        out = []
        for w in kw.split():
            if w.upper() in self.acronyms or self.is_acronym(w):
                out.append(w.upper())
            elif not out:
                out.append(w.capitalize())
            else:
                out.append(w.lower())
        return ' '.join(out)

    def should_filter(self, kw):
        if kw.lower() in self.blacklist:
            return True
        if not (3 <= len(kw) <= 80):
            return True
        if re.search(r'[^a-zA-ZçÇğĞıİöÖşŞüÜ\s\-]', kw):
            return True
        return False

    def process(self, keywords, min_kw=3):
        processed = []
        for kw in keywords:
            if self.should_filter(kw):
                continue
            fixed = self.fix_case(kw)
            if not any(p.lower() == fixed.lower() for p in processed):
                processed.append(fixed)
        # Fall back to the first min_kw raw keywords if everything was filtered
        return processed[:8] if processed else keywords[:min_kw]

# ═══════════════════════════════════════════════════════════════════
# MODEL LOADING (startup)
# ═══════════════════════════════════════════════════════════════════

def load_models():
    global bert_model, bert_tok, kw_model, generator, clf, TOP_KEYWORDS
    auth = {"token": HF_TOKEN} if HF_TOKEN else {}
    log.info("Loading models...")

    # 1) Taxonomy
    log.info("Downloading taxonomy...")
    tax_path = hf_hub_download(
        repo_id=f"{HF_USERNAME}/liftup-bert",
        filename="taksonomi.txt",
        **auth
    )
    with open(tax_path, encoding='utf-8') as f:
        taxonomy = parse_taksonomi(f.read())

    # 2) BERT checkpoint (for TOP_KEYWORDS)
    log.info("Downloading BERT checkpoint...")
    ckpt_path = hf_hub_download(
        repo_id=f"{HF_USERNAME}/liftup-bert",
        filename="checkpoint.pth",
        **auth
    )
    ckpt = torch.load(ckpt_path, map_location="cpu")
    TOP_KEYWORDS = ckpt["TOP_KEYWORDS"]

    # 3) BERT model weights
    log.info("Downloading BERT model weights (422 MB)...")
    bert_path = hf_hub_download(
        repo_id=f"{HF_USERNAME}/liftup-bert",
        filename="best_bert_model.pth",
        **auth
    )
    bert_tok = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
    model = LiftUpBertModel(len(TOP_KEYWORDS))
    model.load_state_dict(torch.load(bert_path, map_location="cpu"))
    model.eval()
    bert_model = model

    # 4) KeyBERT (also serves as the hybrid classifier's embedder)
    log.info("Loading KeyBERT...")
    kw_model = KeyBERT(model='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    embedder = kw_model.model.embedding_model

    # 5) ByT5
    log.info("Downloading ByT5 (1.1 GB)...")
    byt5_dir = snapshot_download(
        repo_id=f"{HF_USERNAME}/liftup-byt5",
        **auth
    )
    byt5_tok = AutoTokenizer.from_pretrained("google/byt5-small")
    byt5_mdl = AutoModelForSeq2SeqLM.from_pretrained(byt5_dir)
    byt5_mdl.eval()
    post = SoftPostProcessor()

    class Generator:
        def __init__(self, tok, mdl, pp):
            self.tok, self.mdl, self.pp = tok, mdl, pp

        def generate(self, title="", abstract=""):
            text = f"keywords: {title} {abstract}".strip()
            inp = self.tok(text, max_length=512, truncation=True, return_tensors="pt")
            with torch.no_grad():
                out = self.mdl.generate(**inp, max_new_tokens=128, do_sample=False,
                                        no_repeat_ngram_size=4, repetition_penalty=1.5)
            pred = self.tok.decode(out[0], skip_special_tokens=True)
            if pred.lower().startswith("keywords:"):
                pred = pred[len("keywords:"):].strip()
            kws = [k.strip() for k in pred.split(';') if k.strip()]
            return self.pp.process(kws)

    generator = Generator(byt5_tok, byt5_mdl, post)

    # 6) Hybrid classifier
    log.info("Initializing hybrid classifier...")
    clf = HibritSiniflandirici(taxonomy, embedder)
    log.info("✅ All models ready!")

# ═══════════════════════════════════════════════════════════════════
# FASTAPI
# ═══════════════════════════════════════════════════════════════════

@asynccontextmanager
async def lifespan(app: FastAPI):
    load_models()
    yield

app = FastAPI(title="LIFT UP Classifier", lifespan=lifespan)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["POST", "GET"],
    allow_headers=["*"],
)

class ClassifyRequest(BaseModel):
    baslik: str
    ozet: str
    keywords: Optional[List[str]] = None

class KategoriResponse(BaseModel):
    kategori: str
    guven: float
    keyword_skor: float
    semantic_skor: float
    eslesmeler: List[str]

class ClassifyResponse(BaseModel):
    prediction: str
    confidence: float
    top_3: List[KategoriResponse]
    extracted_keywords: List[str]
    processing_time_ms: int
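
# Example call against the running Space (assuming it is reachable at
# $BASE_URL; the payload shape matches ClassifyRequest above):
#
#   curl -X POST "$BASE_URL/classify" \
#     -H "Content-Type: application/json" \
#     -d '{"baslik": "Kompozit kanat üretimi", "ozet": "RTM ile karbon fiber ..."}'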

def bert_extract(text):
    """Top BERT keywords via sigmoid multi-label scores (at most 5, prob > 0.01)."""
    enc = bert_tok(str(text).lower(), truncation=True, padding='max_length',
                   max_length=256, return_tensors='pt')
    with torch.no_grad():
        logits = bert_model(enc['input_ids'], enc['attention_mask'])
    probs = torch.sigmoid(logits)[0].numpy()
    idxs = np.argsort(probs)[-10:][::-1]
    return [TOP_KEYWORDS[i] for i in idxs if probs[i] > 0.01][:5]

def keybert_extract(text):
    clean = re.sub(r'[^\w\sğüşıöçĞÜŞİÖÇ]', ' ', text.lower()).strip()
    try:
        kws = kw_model.extract_keywords(clean, keyphrase_ngram_range=(1, 3),
                                        top_n=5, use_mmr=True, diversity=0.2)
        return [k[0] for k in kws][:3]
    except Exception:
        return []

@app.get("/health")
def health():
    return {"status": "ok"}

@app.get("/")
def root():
    return {"message": "LIFT UP API is running", "endpoint": "POST /classify"}

@app.post("/classify", response_model=ClassifyResponse)
def classify(req: ClassifyRequest):
    if not req.baslik.strip() or not req.ozet.strip():
        raise HTTPException(400, "Title and abstract are required")
    t0 = time.time()
    text = f"{req.baslik} {req.ozet}"

    # Keyword extraction from three sources
    bert_kws = bert_extract(text)
    kb_kws = keybert_extract(text)
    byt5_kws = generator.generate(req.baslik, req.ozet)

    # Merge in user-supplied keywords, if any, deduplicating in order
    extra = req.keywords or []
    tum_kws = list(dict.fromkeys(bert_kws + kb_kws + byt5_kws + extra))

    # Classification
    sonuc = clf.classify(tum_kws, text, top_k=3)
    ms = int((time.time() - t0) * 1000)

    return ClassifyResponse(
        prediction=sonuc['prediction'],
        confidence=round(sonuc['confidence'], 4),
        top_3=[
            KategoriResponse(
                kategori=ks.kategori,
                guven=round(ks.final_skor, 4),
                keyword_skor=round(ks.keyword_skor, 4),
                semantic_skor=round(ks.semantic_skor, 4),
                eslesmeler=[e.keyword for e in ks.eslesmeler],
            )
            for ks in sonuc['top_k']
        ],
        extracted_keywords=tum_kws,
        processing_time_ms=ms,
    )
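
# Local-run sketch (an assumption: on Spaces the Docker CMD starts the server,
# typically on port 7860; `uvicorn` must be installed for this to work):
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)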