Engin34 committed on
Commit
e6967c7
·
verified ·
1 Parent(s): d92e624

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +21 -0
  2. app.py +407 -0
  3. requirements.txt +9 -0
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# System dependencies (installed as root, before dropping privileges).
# --no-install-recommends keeps the image free of optional packages.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential git curl \
    && rm -rf /var/lib/apt/lists/*

# Hugging Face Spaces runs the container as UID 1000. Create that user and
# give it a writable HOME so pip and the Hugging Face cache (~/.cache) work.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Python dependencies
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Application file
COPY --chown=user app.py .

# Port
EXPOSE 7860

# Start
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LIFT UP Taksonomi Sınıflandırıcı — Backend API
3
+ Hugging Face Spaces (Docker) üzerinde çalışır.
4
+
5
+ Endpoint:
6
+ POST /classify
7
+ {
8
+ "baslik": "Proje başlığı",
9
+ "ozet": "Proje özeti",
10
+ "keywords": ["opsiyonel", "liste"] # opsiyonel
11
+ }
12
+ →
13
+ {
14
+ "prediction": "Kompozit Yapılar",
15
+ "confidence": 0.82,
16
+ "top_3": [...],
17
+ "extracted_keywords": [...],
18
+ "processing_time_ms": 1240
19
+ }
20
+ """
21
+
22
+ import os
23
+ import re
24
+ import time
25
+ import unicodedata
26
+ import logging
27
+ from contextlib import asynccontextmanager
28
+ from collections import Counter
29
+ from dataclasses import dataclass, field
30
+ from typing import Dict, List, Optional, Set
31
+
32
+ import numpy as np
33
+ import torch
34
+ import torch.nn as nn
35
+ from fastapi import FastAPI, HTTPException
36
+ from fastapi.middleware.cors import CORSMiddleware
37
+ from pydantic import BaseModel
38
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
39
+ from sentence_transformers import SentenceTransformer
40
+ from keybert import KeyBERT
41
+ from huggingface_hub import hf_hub_download, snapshot_download
42
+
43
# Log at INFO and above through a dedicated application logger.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("liftup")

# Hugging Face account hosting the model repositories, and the access token
# used for downloads (to be added as a Space secret).
HF_USERNAME = os.environ.get("HF_USERNAME", "Engin34")
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# --- Global model handles -------------------------------------------------
# All handles start as None; load_models() declares them global and assigns
# them.
bert_model = None
bert_tok = None
kw_model = None
generator = None
clf = None
TOP_KEYWORDS = None

# Device handle fixed to the CPU.
device = torch.device("cpu")
57
+
58
+
59
+ # ═══════════════════════════════════════════════════════════════════
60
+ # TAKSONOMİ PARSER
61
+ # ═══════════════════════════════════════════════════════════════════
62
+ def _temizle(k):
63
+ k = k.replace('\u200b','').replace('\ufeff','')
64
+ k = unicodedata.normalize('NFKC', k)
65
+ return re.sub(r'\s+',' ', k).strip().lower()
66
+
67
+ def _parantez_ayir(k):
68
+ m = re.match(r'^(.+?)\s*\((.+?)\)\s*$', k)
69
+ if not m: return [k]
70
+ ana, ic = m.group(1).strip(), m.group(2).strip()
71
+ if any(a in ic.lower() for a in ['bağlam','kısmı','tarafı','proses','analiz','anahtarları','servisleme']):
72
+ return [ana]
73
+ return [ana] + [p.strip() for p in ic.split('/') if p.strip()]
74
+
75
+ def _virgul_ayir(metin):
76
+ sonuc, buf, d = [], [], 0
77
+ for c in metin:
78
+ if c == '(': d += 1; buf.append(c)
79
+ elif c == ')': d -= 1; buf.append(c)
80
+ elif c == ',' and d == 0: sonuc.append(''.join(buf)); buf = []
81
+ else: buf.append(c)
82
+ if buf: sonuc.append(''.join(buf))
83
+ return sonuc
84
+
85
def parse_taksonomi(icerik: str) -> Dict:
    """Parse the taxonomy text into ``{category: {'keywords': set_of_str}}``.

    A category starts on a line of the form ``<number>) <name>``. Its body
    runs up to the next such heading (or the end of the text). The keywords
    are taken from the span between the first ``(`` and the last ``)`` of
    that body; categories whose body has no such span are skipped. Each
    keyword is split on top-level commas, on parenthesised alternatives and
    on ``/``, then normalised; results shorter than two characters are
    discarded.
    """
    icerik = icerik.replace('\u200b', '')
    headings = list(re.finditer(r'^\s*(\d+)\)\s+(.+?)\s*$', icerik, re.MULTILINE))
    # A section ends where the next heading starts; the last one ends at EOF.
    section_ends = [h.start() for h in headings[1:]] + [len(icerik)]

    taxonomy = {}
    for heading, end in zip(headings, section_ends):
        body = icerik[heading.end():end].strip()
        paren = re.search(r'\((.+)\)', body, re.DOTALL)
        if paren is None:
            continue
        cleaned = (
            _temizle(token)
            for piece in _virgul_ayir(paren.group(1))
            for variant in _parantez_ayir(piece.strip())
            for token in variant.split('/')
        )
        taxonomy[heading.group(2).strip()] = {
            'keywords': {kw for kw in cleaned if len(kw) >= 2}
        }
    return taxonomy
102
+
103
+
104
+ # ═══════════════════════════════════════════════════════════════════
105
+ # HİBRİT SINIFLANDIRICI
106
+ # ═══════════════════════════════════════════════════════════════════
107
@dataclass
class EslesmeBilgisi:
    """One match between an extracted keyword and a taxonomy keyword."""

    # The extracted keyword that produced the match.
    keyword: str
    # Kind of match; the classifier writes 'exact' or 'partial'.
    eslesme_tipi: str
    # The taxonomy keyword that was matched.
    eslesen_taksonomi_kw: str
    # Score contributed by this match.
    puan: float
110
+
111
@dataclass
class KategoriSkoru:
    """Scores computed for a single taxonomy category."""

    # Category name.
    kategori: str
    # Weighted combination of the keyword and semantic scores.
    final_skor: float
    # Keyword-overlap score.
    keyword_skor: float
    # Embedding-similarity score.
    semantic_skor: float
    # Match records behind the keyword score; a fresh list per instance.
    eslesmeler: list = field(default_factory=list)
115
+
116
class HibritSiniflandirici:
    """Hybrid taxonomy classifier: keyword overlap plus embedding similarity.

    For every category two scores in ``[0, 1]`` are computed and combined as
    ``keyword_weight * keyword_score + semantic_weight * semantic_score``.

    Args:
        taxonomy: Mapping ``category -> {'keywords': iterable}``.
        embedder: Object exposing ``encode(list_of_str, show_progress_bar=...,
            convert_to_numpy=...)`` that returns one vector per input string.
        keyword_weight: Weight of the keyword-overlap score.
        semantic_weight: Weight of the embedding-similarity score.
    """

    def __init__(self, taxonomy, embedder, keyword_weight=0.4, semantic_weight=0.6):
        # Keep a normalised private copy: lower-cased, trimmed, no blanks.
        self.taxonomy = {
            cat: {'keywords': {str(k).lower().strip()
                               for k in data.get('keywords', set())
                               if str(k).strip()}}
            for cat, data in taxonomy.items()
        }
        self.kw_w, self.sem_w = keyword_weight, semantic_weight
        self.embedder = embedder
        log.info("Centroid'ler hesaplanıyor...")
        self.centroids = self._centroids()
        self.idf = self._idf()
        log.info("Hazır: %d kategori", len(self.taxonomy))

    def _centroids(self):
        """Return ``{category: unit-length mean keyword embedding or None}``.

        Categories without keywords map to ``None``.
        """
        centroids = {}
        for cat, data in self.taxonomy.items():
            kws = list(data['keywords'])
            if not kws:
                centroids[cat] = None
                continue
            embs = self.embedder.encode(kws, show_progress_bar=False, convert_to_numpy=True)
            mean_vec = np.mean(embs, axis=0)
            norm = np.linalg.norm(mean_vec)
            centroids[cat] = mean_vec / norm if norm > 0 else mean_vec
        return centroids

    def _idf(self):
        """Return ``{keyword: log(N / categories_containing_it) + 1}``."""
        counts = Counter()
        for data in self.taxonomy.values():
            counts.update(data['keywords'])
        n_categories = len(self.taxonomy)
        return {kw: np.log(n_categories / seen) + 1.0 for kw, seen in counts.items()}

    def _kw_score(self, extracted):
        """Score every category by keyword overlap with *extracted*.

        An exact match contributes ``2 * idf``; otherwise the first taxonomy
        keyword that contains, or is contained in, the extracted keyword
        (both at least four characters long) contributes ``1 * idf``. The
        sum is divided by the best attainable total and capped at 1.

        Returns:
            ``{category: (score, [EslesmeBilgisi, ...])}``.
        """
        # Coerce with str() exactly as __init__ does, so a non-string keyword
        # cannot crash on ``.lower()``.
        ext = [str(k).lower().strip() for k in extracted if k and str(k).strip()]
        max_idf = max(self.idf.values(), default=1.0)
        # Normaliser: every extracted keyword matching exactly at maximum IDF.
        max_p = max(len(ext) * 2.0 * max_idf, 1e-6)

        results = {}
        for cat, data in self.taxonomy.items():
            cat_kws = data['keywords']
            score, matches = 0.0, []
            for kw in ext:
                idf_w = self.idf.get(kw, 1.0)
                if kw in cat_kws:
                    points = 2.0 * idf_w
                    score += points
                    matches.append(EslesmeBilgisi(kw, 'exact', kw, points))
                    continue
                if len(kw) < 4:
                    # Too short for a meaningful substring match.
                    continue
                for cat_kw in cat_kws:
                    if len(cat_kw) >= 4 and (kw in cat_kw or cat_kw in kw):
                        points = 1.0 * idf_w
                        score += points
                        matches.append(EslesmeBilgisi(kw, 'partial', cat_kw, points))
                        break
            results[cat] = (min(score / max_p, 1.0), matches)
        return results

    def _sem_score(self, extracted, text=None):
        """Score every category by similarity between input and centroid.

        The query is *text* and the space-joined keywords, joined by
        ``" | "``. Its normalised embedding is dotted with each centroid and
        the result is mapped to ``[0, 1]`` with ``(dot + 1) / 2``, clamped.
        Categories without a centroid, or an empty query, score 0.
        """
        parts = []
        if text and str(text).strip():
            parts.append(str(text).strip())
        if extracted:
            # str() keeps this consistent with _kw_score for non-str items.
            parts.append(" ".join(str(k) for k in extracted))
        if not parts:
            return {cat: 0.0 for cat in self.taxonomy}

        emb = self.embedder.encode([" | ".join(parts)], show_progress_bar=False,
                                   convert_to_numpy=True)[0]
        norm = np.linalg.norm(emb)
        if norm > 0:
            emb = emb / norm
        return {
            cat: max(0.0, min(1.0, (float(np.dot(emb, centroid)) + 1.0) / 2.0))
            if centroid is not None else 0.0
            for cat, centroid in self.centroids.items()
        }

    def classify(self, keywords, text=None, top_k=3):
        """Rank all categories for the given keywords and optional text.

        Returns:
            ``{'prediction': best category name, 'confidence': its final
            score, 'top_k': the top_k KategoriSkoru objects, best first}``.

        Raises:
            IndexError: If the taxonomy contains no categories.
        """
        kw_results = self._kw_score(keywords)
        sem_scores = self._sem_score(keywords, text)
        scored = {}
        for cat in self.taxonomy:
            kw_norm, matches = kw_results[cat]
            final = self.kw_w * kw_norm + self.sem_w * sem_scores[cat]
            scored[cat] = KategoriSkoru(cat, final, kw_norm, sem_scores[cat], matches)
        ranked = sorted(scored.values(), key=lambda s: s.final_skor, reverse=True)
        return {'prediction': ranked[0].kategori,
                'confidence': ranked[0].final_skor,
                'top_k': ranked[:top_k]}
183
+
184
+
185
+ # ═══════════════════════════════════════════════════════════════════
186
+ # BERT MODEL
187
+ # ═══════════════════════════════════════════════════════════════════
188
class LiftUpBertModel(nn.Module):
    """``dbmdz/bert-base-turkish-cased`` encoder with a linear output head.

    The head maps the 768-wide hidden state at sequence position 0 to
    ``num_labels`` logits, with dropout (p=0.3) applied first.
    """

    def __init__(self, num_labels=128):
        super().__init__()
        self.bert = AutoModel.from_pretrained("dbmdz/bert-base-turkish-cased")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the logits for a batch of tokenised inputs."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0]
        return self.classifier(self.dropout(first_token_state))
197
+
198
+
199
+ # ═══════════════════════════════════════════════════════════════════
200
+ # POST-PROCESSOR
201
+ # ═══════════════════════════════════════════════════════════════════
202
class SoftPostProcessor:
    """Filters, case-normalises and de-duplicates generated keywords."""

    def __init__(self):
        # Known bad generations that are always dropped.
        self.blacklist = {'kombinatür', 'hesonomik', 'modülasyonları',
                          'difüzörlü', 'optimizasyonlarını'}
        # Acronyms in their canonical spelling (note the mixed-case 'IoT').
        self.acronyms = {'CFD', 'FEA', 'CAD', 'ROS', 'CNN', 'AI', 'ML', 'DL',
                         'IoT', 'GPU', 'SSD'}

    def is_acronym(self, w):
        """Return True for an all-uppercase word of 2 to 5 characters."""
        return w.isupper() and 2 <= len(w) <= 5

    def fix_case(self, kw):
        """Return *kw* in sentence case with acronyms preserved.

        A word whose uppercase form matches a known acronym is written in
        that acronym's canonical spelling (so ``iot`` becomes ``IoT``); any
        other short all-uppercase word is kept uppercase; the first
        remaining word is capitalised and the rest are lower-cased.
        """
        # Look acronyms up by uppercase form; comparing ``w.upper()`` with
        # the raw set could never match a mixed-case entry such as 'IoT'.
        canonical = {a.upper(): a for a in self.acronyms}
        out = []
        for w in kw.split():
            known = canonical.get(w.upper())
            if known is not None:
                out.append(known)
            elif self.is_acronym(w):
                out.append(w.upper())
            elif not out:
                out.append(w.capitalize())
            else:
                out.append(w.lower())
        return ' '.join(out)

    def should_filter(self, kw):
        """Return True when *kw* must be discarded.

        A keyword is discarded when it is blacklisted, is not 3 to 80
        characters long, or contains anything other than ASCII or Turkish
        letters, whitespace and hyphens.
        """
        if kw.lower() in self.blacklist:
            return True
        if not (3 <= len(kw) <= 80):
            return True
        if re.search(r'[^a-zA-ZçÇğĞıİöÖşŞüÜ\s\-]', kw):
            return True
        return False

    def process(self, keywords, min_kw=3):
        """Return up to eight cleaned, case-insensitively unique keywords.

        When every keyword is filtered out, the first three raw keywords are
        returned instead. *min_kw* is unused and kept only so existing
        callers stay compatible.
        """
        processed, seen = [], set()
        for kw in keywords:
            if self.should_filter(kw):
                continue
            fixed = self.fix_case(kw)
            key = fixed.lower()
            if key not in seen:
                seen.add(key)
                processed.append(fixed)
        return processed[:8] if processed else keywords[:3]
226
+
227
+
228
+ # ═══════════════════════════════════════════════════════════════════
229
+ # MODEL YÜKLEME (startup)
230
+ # ═══════════════════════════════════════════════════════════════════
231
def load_models():
    """Download every model artefact and populate the module-level handles.

    In order, this function:
      1. downloads and parses ``taksonomi.txt``;
      2. loads ``checkpoint.pth`` to obtain ``TOP_KEYWORDS``;
      3. loads the fine-tuned weights into ``LiftUpBertModel``;
      4. builds KeyBERT, whose embedding model is reused by the classifier;
      5. downloads the ByT5 snapshot and wraps it in a keyword generator;
      6. builds the hybrid classifier.

    Assigns the globals ``bert_model``, ``bert_tok``, ``kw_model``,
    ``generator``, ``clf`` and ``TOP_KEYWORDS``. Any download or loading
    error propagates to the caller.
    """
    global bert_model, bert_tok, kw_model, generator, clf, TOP_KEYWORDS

    # Pass the token only when one is configured.
    auth = {"token": HF_TOKEN} if HF_TOKEN else {}
    bert_repo = f"{HF_USERNAME}/liftup-bert"
    log.info("Modeller yükleniyor...")

    # 1) Taxonomy
    log.info("Taksonomi indiriliyor...")
    tax_path = hf_hub_download(repo_id=bert_repo, filename="taksonomi.txt", **auth)
    with open(tax_path, encoding='utf-8') as f:
        taxonomy = parse_taksonomi(f.read())

    # 2) BERT checkpoint (for TOP_KEYWORDS)
    log.info("BERT checkpoint indiriliyor...")
    ckpt_path = hf_hub_download(repo_id=bert_repo, filename="checkpoint.pth", **auth)
    # SECURITY: torch.load unpickles the file, which can execute arbitrary
    # code. Only acceptable while the repository is trusted; consider
    # weights_only=True if the checkpoint contents allow it.
    ckpt = torch.load(ckpt_path, map_location="cpu")
    TOP_KEYWORDS = ckpt["TOP_KEYWORDS"]

    # 3) BERT model weights
    log.info("BERT model ağırlıkları indiriliyor (422 MB)...")
    bert_path = hf_hub_download(repo_id=bert_repo, filename="best_bert_model.pth", **auth)
    bert_tok = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
    model = LiftUpBertModel(len(TOP_KEYWORDS))
    # SECURITY: same unpickling caveat as the checkpoint above.
    model.load_state_dict(torch.load(bert_path, map_location="cpu"))
    model.eval()
    bert_model = model

    # 4) KeyBERT (its embedding model is also the hybrid classifier's embedder)
    log.info("KeyBERT yükleniyor...")
    kw_model = KeyBERT(model='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    embedder = kw_model.model.embedding_model

    # 5) ByT5
    log.info("ByT5 indiriliyor (1.1 GB)...")
    byt5_dir = snapshot_download(repo_id=f"{HF_USERNAME}/liftup-byt5", **auth)
    byt5_tok = AutoTokenizer.from_pretrained("google/byt5-small")
    byt5_mdl = AutoModelForSeq2SeqLM.from_pretrained(byt5_dir)
    byt5_mdl.eval()
    post = SoftPostProcessor()

    class Generator:
        """Generates keyword candidates with ByT5 and post-processes them."""

        def __init__(self, tok, mdl, pp):
            self.tok, self.mdl, self.pp = tok, mdl, pp

        def generate(self, title="", abstract=""):
            """Return the post-processed keywords for a title and abstract."""
            text = f"keywords: {title} {abstract}".strip()
            inp = self.tok(text, max_length=512, truncation=True, return_tensors="pt")
            with torch.no_grad():
                out = self.mdl.generate(**inp, max_new_tokens=128, do_sample=False,
                                        no_repeat_ngram_size=4, repetition_penalty=1.5)
            pred = self.tok.decode(out[0], skip_special_tokens=True)
            # Drop the prompt prefix if the model echoed it.
            if pred.lower().startswith("keywords:"):
                pred = pred[len("keywords:"):].strip()
            kws = [k.strip() for k in pred.split(';') if k.strip()]
            return self.pp.process(kws)

    generator = Generator(byt5_tok, byt5_mdl, post)

    # 6) Hybrid classifier
    log.info("Hibrit sınıflandırıcı başlatılıyor...")
    clf = HibritSiniflandirici(taxonomy, embedder)
    log.info("✅ Tüm modeller hazır!")
302
+
303
+
304
+ # ═══════════════════════════════════════════════════════════════════
305
+ # FASTAPI
306
+ # ═══════════════════════════════════════════════════════════════════
307
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan hook: load all models before the application starts serving."""
    load_models()
    yield


app = FastAPI(title="LIFT UP Sınıflandırıcı", lifespan=lifespan)

# CORS: any origin may call the API with GET or POST and any request header.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["POST", "GET"],
    allow_headers=["*"],
)
320
+
321
class ClassifyRequest(BaseModel):
    """Request body of ``POST /classify``."""

    # Project title.
    baslik: str
    # Project abstract.
    ozet: str
    # Optional caller-supplied keywords.
    keywords: Optional[List[str]] = None
325
+
326
+
327
class KategoriResponse(BaseModel):
    """One ranked category in the classification response."""

    # Category name.
    kategori: str
    # Final combined score.
    guven: float
    # Keyword-overlap score.
    keyword_skor: float
    # Embedding-similarity score.
    semantic_skor: float
    # Extracted keywords that matched this category.
    eslesmeler: List[str]
333
+
334
+
335
class ClassifyResponse(BaseModel):
    """Response body of ``POST /classify``."""

    # Name of the best-scoring category.
    prediction: str
    # Final score of the best-scoring category.
    confidence: float
    # The highest-ranked categories, best first.
    top_3: List[KategoriResponse]
    # Every keyword that was passed to the classifier.
    extracted_keywords: List[str]
    # Processing time in milliseconds.
    processing_time_ms: int
341
+
342
+
343
def bert_extract(text):
    """Return up to five keywords predicted by the BERT classifier.

    The lower-cased text is tokenised (padded/truncated to 256 tokens) and
    run through the model without gradients. Of the ten highest sigmoid
    scores, those above 0.01 are kept, best first, limited to five.
    """
    encoded = bert_tok(
        str(text).lower(),
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors='pt',
    )
    with torch.no_grad():
        logits = bert_model(encoded['input_ids'], encoded['attention_mask'])
    scores = torch.sigmoid(logits)[0].numpy()

    # Indices of the ten largest scores, in descending score order.
    ranked = np.argsort(scores)[-10:][::-1]
    selected = []
    for idx in ranked:
        if scores[idx] > 0.01:
            selected.append(TOP_KEYWORDS[idx])
    return selected[:5]
351
+
352
def keybert_extract(text):
    """Return up to three keyphrases extracted with KeyBERT.

    The text is lower-cased and every character that is not a word
    character, whitespace or a Turkish letter is replaced by a space.
    Extraction is best-effort: on any error the failure is logged and an
    empty list is returned.
    """
    clean = re.sub(r'[^\w\sğüşıöçĞÜŞİÖÇ]', ' ', text.lower()).strip()
    try:
        kws = kw_model.extract_keywords(clean, keyphrase_ngram_range=(1, 3),
                                        top_n=5, use_mmr=True, diversity=0.2)
        return [k[0] for k in kws][:3]
    except Exception:
        # Log the failure instead of hiding it; a bare ``except:`` would
        # also swallow KeyboardInterrupt and SystemExit.
        log.exception("KeyBERT anahtar kelime çıkarımı başarısız oldu")
        return []
360
+
361
+
362
@app.get("/health")
def health():
    """Health-check endpoint; always returns a fixed OK payload."""
    payload = {"status": "ok"}
    return payload
364
+
365
+
366
@app.get("/")
def root():
    """Landing endpoint: returns a status message and names the main endpoint."""
    info = {
        "message": "LIFT UP API çalışıyor",
        "endpoint": "POST /classify",
    }
    return info
368
+
369
+
370
@app.post("/classify", response_model=ClassifyResponse)
def classify(req: ClassifyRequest):
    """Classify a project into a taxonomy category.

    Keywords are extracted with the BERT classifier, KeyBERT and the ByT5
    generator, merged with any caller-supplied keywords (duplicates removed,
    first occurrence kept) and passed to the hybrid classifier together with
    the combined title and abstract.

    Raises:
        HTTPException: Status 400 when the title or the abstract is blank.
    """
    if not req.baslik.strip() or not req.ozet.strip():
        raise HTTPException(400, "Başlık ve özet zorunludur")

    # perf_counter is monotonic, so the elapsed time cannot go negative or
    # jump when the system clock is adjusted.
    t0 = time.perf_counter()
    text = f"{req.baslik} {req.ozet}"

    # Keyword extraction
    bert_kws = bert_extract(text)
    kb_kws = keybert_extract(text)
    byt5_kws = generator.generate(req.baslik, req.ozet)

    # Add caller-supplied keywords, trimmed and without blank entries.
    extra = [k.strip() for k in (req.keywords or []) if k.strip()]
    tum_kws = list(dict.fromkeys(bert_kws + kb_kws + byt5_kws + extra))

    # Classification
    sonuc = clf.classify(tum_kws, text, top_k=3)

    ms = int((time.perf_counter() - t0) * 1000)

    return ClassifyResponse(
        prediction=sonuc['prediction'],
        confidence=round(sonuc['confidence'], 4),
        top_3=[
            KategoriResponse(
                kategori=ks.kategori,
                guven=round(ks.final_skor, 4),
                keyword_skor=round(ks.keyword_skor, 4),
                semantic_skor=round(ks.semantic_skor, 4),
                eslesmeler=[e.keyword for e in ks.eslesmeler],
            )
            for ks in sonuc['top_k']
        ],
        extracted_keywords=tum_kws,
        processing_time_ms=ms,
    )
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.111.0
2
+ uvicorn==0.29.0
3
+ torch==2.2.0
4
+ transformers==4.40.0
5
+ sentence-transformers==2.7.0
6
+ keybert==0.8.4
7
+ huggingface-hub==0.22.2
8
+ numpy==1.26.4
9
+ pydantic==2.7.0