File size: 13,278 Bytes
6ca330f a9f264c 77aa3fc 5c14c39 f07ac7d 30fbe48 6ca330f 9fb0308 30fbe48 77aa3fc 373a3a1 6ca330f fb73ae4 77aa3fc 2685f20 30fbe48 e0016b4 2685f20 30fbe48 373a3a1 30fbe48 cbec340 8f79955 83255e4 7a1f249 8f79955 7a1f249 83255e4 7a1f249 8f79955 7a1f249 83255e4 7a1f249 83255e4 8f79955 7a1f249 8f79955 83255e4 e79a50f b7cc25d 83255e4 e79a50f 83255e4 e79a50f 83255e4 e79a50f 83255e4 b7cc25d 83255e4 e79a50f 83255e4 8f79955 b7cc25d 83255e4 b7cc25d 83255e4 b7cc25d 2e55c9e 83255e4 b7cc25d 83255e4 e79a50f 83255e4 7a1f249 cbec340 83255e4 c81d689 e79a50f cbec340 ac713be 324a66b ac713be 324a66b c81d689 30fbe48 f0773fc 83255e4 e0016b4 f0773fc e0016b4 f0773fc e0016b4 30fbe48 e0016b4 30fbe48 f0773fc 30fbe48 e0016b4 30fbe48 5c14c39 32575f3 30fbe48 9fb0308 e0016b4 9fb0308 e0016b4 30fbe48 e0016b4 9fb0308 2685f20 6ca330f 30fbe48 77aa3fc 30fbe48 e0016b4 77aa3fc e0016b4 7890908 30fbe48 e0016b4 30fbe48 e0016b4 77aa3fc 30fbe48 77aa3fc 5c14c39 8d9ea82 5c14c39 77aa3fc e0016b4 8d9ea82 77aa3fc e0016b4 77aa3fc 30fbe48 e0016b4 30fbe48 e0016b4 cb83cd2 e0016b4 cb83cd2 e0016b4 77aa3fc e0016b4 77aa3fc 30fbe48 e0016b4 cb83cd2 e0016b4 cb83cd2 e0016b4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 
245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 | import os
import threading
import requests
import shutil
import time
import re
import logging
from typing import List, Optional
from fastapi import FastAPI, Query, BackgroundTasks, HTTPException
from fastapi.responses import RedirectResponse, HTMLResponse
from pydantic import BaseModel
from db.database import get_session, init_db, DB_PATH, engine
from db.config import sync_to_bucket, BUCKET_DIR, LOCAL_DIR, init_storage
from db.models import Actress, ActressAlias, Video, VideoActress, Label, CrawlStatus
from crawler.crawl import run_crawl, stop_crawl
from crawler.cache import load_progress, clear_progress
# Configure logging at import time: INFO level, timestamp/name/level/message format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Logger used by every function in this module.
logger = logging.getLogger("sougouwiki-api")
# Application object; the route decorators below register handlers on it.
app = FastAPI(title="Sougouwiki Professional API", version="1.1.0")
# Held while a worker thread is being started and while the worker clears
# `master_thread` on exit.
worker_lock = threading.Lock()
# Handle of the background worker thread; None when no worker has been started
# or after the last worker finished.
master_thread: Optional[threading.Thread] = None
def backfill_labels_task():
    """Run a single pass of the legacy label backfill.

    Loads at most 5000 videos whose ``label_id`` is NULL, takes the first
    parenthesised segment of each title (ASCII or full-width parentheses),
    splits it on ``/`` and links the video to the last piece that names an
    existing ``Label``. Videos with no title, no parenthesised segment or no
    known label are left untouched.

    Only one chunk is handled per call: rows that cannot be resolved stay in
    the "label_id IS NULL" set, so re-querying would return them again.
    Any exception is logged and rolled back; the session is always closed.
    """
    db = get_session()
    try:
        logger.info("Starting intelligent label backfill...")
        chunk_size = 5000
        # `== None` is intentional: SQLAlchemy renders it as `IS NULL`.
        pending = (
            db.query(Video)
            .filter(Video.label_id == None)  # noqa: E711
            .limit(chunk_size)
            .all()
        )
        if pending:
            for video in pending:
                title = video.title
                if not title:
                    continue
                found = re.search(r"[\uff08(](.+?)[\uff09)]", title)
                if not found:
                    continue
                pieces = [piece.strip() for piece in re.split(r"[//]", found.group(1).strip())]
                chosen = None
                for piece in pieces:
                    candidate = db.query(Label).filter_by(name=piece).first()
                    if candidate:
                        # No early exit: a later matching piece replaces an earlier one.
                        chosen = candidate
                if chosen:
                    video.label_id = chosen.id
            db.commit()
    except Exception as e:
        logger.error(f"Backfill error: {e}")
        db.rollback()
    finally:
        db.close()
# Progress snapshot of the label backfill, returned as-is by
# GET /api/admin/backfill/status and updated in place by improved_backfill_task:
#   is_running    - True while a scan is in progress
#   last_id       - id of the most recently visited Video row
#   total_fixed   - number of videos that were assigned a label_id
#   total_scanned - number of Video rows visited
backfill_status = {"is_running": False, "last_id": 0, "total_fixed": 0, "total_scanned": 0}
from sqlalchemy import func
# First bracket-delimited segment of a title. Any bracket character (ASCII,
# full-width or lenticular; opening or closing) may act as either delimiter,
# and at least one character must sit between the two delimiters.
_BRACKETED_SEGMENT_RE = re.compile(
    r"[\uff08\uff09\(\)\[\]\u3010\u3011](.+?)[\uff08\uff09\(\)\[\]\u3010\u3011]"
)
# Separators between label names inside a bracketed segment.
# NOTE(review): "/" appears twice in this class; the second one may originally
# have been a full-width solidus (U+FF0F) lost in transit - confirm before changing.
_LABEL_SEPARATOR_RE = re.compile(r"[//|]")


def _label_candidates(title):
    """Return the non-empty, stripped label-name candidates found in *title*, in order."""
    found = _BRACKETED_SEGMENT_RE.search(title)
    if found is None:
        return []
    pieces = (piece.strip() for piece in _LABEL_SEPARATOR_RE.split(found.group(1).strip()))
    return [piece for piece in pieces if piece]


def _find_label(session, names):
    """Return the first Label whose name equals one of *names* (compared via SQL lower()), or None."""
    for name in names:
        hit = session.query(Label).filter(func.lower(Label.name) == func.lower(name)).first()
        if hit:
            return hit
    return None


def improved_backfill_task():
    """Scan the whole Video table once and fill in missing ``label_id`` values.

    Videos are read in ascending ``id`` order in batches of 5000, paging by
    "id greater than the last id seen" so that updating rows never disturbs
    the paging. For each video that has a title but no label, the first
    bracketed segment of the title is split into candidate names and the
    first candidate that matches an existing ``Label`` is linked to the video.
    Changes are committed once per batch.

    Progress is published through the module-level ``backfill_status`` dict,
    whose counters are reset at the start of every run. When at least one
    video was fixed, ``sync_to_bucket()`` is called after the scan.

    Exceptions are logged with their traceback and the current transaction is
    rolled back; nothing is re-raised. The session is always closed and
    ``is_running`` is always cleared.
    """
    session = get_session()
    last_id = 0
    total_fixed = 0
    total_scanned = 0
    # Reset every counter so the status endpoint never shows a previous run's numbers.
    backfill_status.update(is_running=True, last_id=0, total_fixed=0, total_scanned=0)
    try:
        logger.info("Starting improved full-table label scan...")
        batch_size = 5000

        # Diagnostic: log a few label names so mismatches are easier to spot.
        sample_labels = session.query(Label).limit(5).all()
        logger.info(f"DB Label samples: {[l.name for l in sample_labels]}")

        while True:
            videos = (
                session.query(Video)
                .filter(Video.id > last_id)
                .order_by(Video.id)
                .limit(batch_size)
                .all()
            )
            if not videos:
                break

            for video in videos:
                last_id = video.id
                total_scanned += 1
                backfill_status["last_id"] = last_id
                backfill_status["total_scanned"] = total_scanned

                if video.label_id is not None or not video.title:
                    continue

                # A second "fallback" pattern used to be tried here. Every title it
                # could match with non-empty content is already matched by
                # _BRACKETED_SEGMENT_RE, so it never contributed a result and was dropped.
                label = _find_label(session, _label_candidates(video.title))
                if label:
                    video.label_id = label.id
                    total_fixed += 1
                    backfill_status["total_fixed"] = total_fixed

            session.commit()
            if total_scanned % 10000 == 0:
                logger.info(f"Scan progress: Scanned {total_scanned}, Fixed {total_fixed}")

        logger.info(f"Full scan finished. Total fixed: {total_fixed}")
        if total_fixed > 0:
            sync_to_bucket()
    except Exception as e:
        # logger.exception keeps the traceback, which a bare message would lose.
        logger.exception(f"Improved backfill error: {e}")
        session.rollback()
    finally:
        backfill_status["is_running"] = False
        session.close()
@app.get(
    "/api/admin/backfill/status",
    include_in_schema=False,
)
def get_backfill_status():
    """Return the live ``backfill_status`` dict (not a copy) describing the label backfill."""
    return backfill_status
@app.get("/api/admin/labels/search", include_in_schema=False)
def search_label_samples(q: str = ""):
    """List up to 50 labels as ``{"id", "name"}`` dicts.

    When ``q`` is non-empty only labels whose name contains it (SQL ``LIKE
    %q%``) are returned; otherwise the first 50 labels are returned.
    """
    db = get_session()
    try:
        label_query = db.query(Label)
        if q:
            label_query = label_query.filter(Label.name.like(f"%{q}%"))
        payload = []
        for label in label_query.limit(50).all():
            payload.append({"id": label.id, "name": label.name})
        return payload
    finally:
        db.close()
def master_background_worker(max_pages: int):
    """Body of the background worker thread.

    Runs, in order: ``init_db()``, the label backfill, ``run_crawl`` limited
    to ``max_pages`` pages, and a final ``sync_to_bucket()``. Any exception
    is logged and swallowed. On exit the global ``master_thread`` handle is
    reset to None under ``worker_lock``.
    """
    global master_thread
    try:
        init_db()
        improved_backfill_task()
        logger.info(f"Starting crawler for {max_pages} pages...")
        run_crawl(max_pages=max_pages)
        sync_to_bucket()
    except Exception as exc:
        logger.error(f"Worker error: {exc}")
    finally:
        # Clear the handle so a new worker can be started afterwards.
        with worker_lock:
            master_thread = None
def sync_heartbeat(interval_seconds: float = 120):
    """Call ``sync_to_bucket()`` at a fixed interval, forever.

    Meant to be the target of a daemon thread. The loop sleeps first, so the
    first sync happens ``interval_seconds`` after the thread starts.

    Args:
        interval_seconds: Pause between two sync attempts. Defaults to 120.

    A failing sync is logged with its traceback and retried on the next
    tick; without this, one exception would end the thread and silently
    stop all further periodic syncs.
    """
    while True:
        time.sleep(interval_seconds)
        try:
            sync_to_bucket()
        except Exception:
            logger.exception("Periodic bucket sync failed; retrying on next heartbeat")
@app.on_event("startup")
async def startup_event():
    """Application startup hook.

    Calls ``init_storage()``, starts the daemon heartbeat thread running
    ``sync_heartbeat``, and starts the background worker for 342 pages
    unless one is already alive.
    """
    global master_thread
    init_storage()

    heartbeat = threading.Thread(target=sync_heartbeat, daemon=True)
    heartbeat.start()

    with worker_lock:
        worker_alive = master_thread is not None and master_thread.is_alive()
        if not worker_alive:
            master_thread = threading.Thread(
                target=master_background_worker,
                args=(342,),
            )
            master_thread.start()
@app.get("/api/admin/db/export", include_in_schema=False)
def export_database():
    """Allows downloading the current production database file.

    Returns:
        A ``FileResponse`` streaming the file at ``DB_PATH`` as
        ``sougouwiki_production.db``.

    Raises:
        HTTPException: 404 when no file exists at ``DB_PATH``.
    """
    # FileResponse is not among this module's top-level imports; without this
    # import the route fails with NameError whenever the database exists.
    from fastapi.responses import FileResponse

    if not os.path.exists(DB_PATH):
        raise HTTPException(404, "Database file not found")
    return FileResponse(
        DB_PATH,
        filename="sougouwiki_production.db",
        media_type="application/x-sqlite3",
    )
@app.get("/", response_class=HTMLResponse, include_in_schema=False)
def root_ui():
    """Serve the single-page admin UI.

    The page polls ``/api/admin/crawl/status`` every 5 seconds, shows whether
    the worker is running, dumps the status JSON, and offers a button that
    POSTs to ``/api/admin/crawl/start``.
    """
    page = """
<html>
<head><title>Sougouwiki Hub</title></head>
<body style="font-family:sans-serif; padding:40px; background:#f6f8fa;">
<h1>Sougouwiki Manager</h1>
<div style="background:white; border:1px solid #e1e4e8; padding:20px; border-radius:6px;">
<p id="st">Checking status...</p>
<button onclick="fetch('/api/admin/crawl/start', {method:'POST'})">Restart</button>
<div id="stats" style="margin-top:20px;"><pre>Loading...</pre></div>
</div>
<script>
async function update() {
const r = await fetch('/api/admin/crawl/status');
const d = await r.json();
document.getElementById('st').innerText = d.is_running ? '🟢 Running' : '💤 Idle';
document.getElementById('stats').innerHTML = `<pre>${JSON.stringify(d, null, 2)}</pre>`;
}
setInterval(update, 5000); update();
</script>
</body>
</html>
"""
    return page
@app.get("/api/admin/crawl/status")
def get_system_status():
    """Report worker state, crawl progress and row counts.

    Returns:
        ``{"is_running", "progress", "db_stats": {"pages", "actresses",
        "videos"}}`` on success, or ``{"error": <message>}`` if anything
        raises while the report is built. The session is always closed.
    """
    session = get_session()
    try:
        # Read the global once: the worker thread resets it to None when it
        # finishes, so checking and then dereferencing the global separately
        # could hit None between the two reads.
        worker = master_thread
        return {
            "is_running": worker is not None and worker.is_alive(),
            "progress": load_progress(),
            "db_stats": {
                "pages": session.query(CrawlStatus).count(),
                "actresses": session.query(Actress).count(),
                "videos": session.query(Video).count(),
            },
        }
    except Exception as e:
        return {"error": str(e)}
    finally:
        session.close()
@app.post("/api/admin/crawl/start")
def api_manual_start(max_pages: int = 342):
    """Start the background worker unless one is already alive.

    Returns ``{"message": "Already running"}`` when a live worker exists,
    otherwise starts a new worker thread for ``max_pages`` pages and returns
    ``{"message": "Started"}``. The check and the start happen under
    ``worker_lock``.
    """
    global master_thread
    with worker_lock:
        already_running = bool(master_thread) and master_thread.is_alive()
        if already_running:
            return {"message": "Already running"}
        new_worker = threading.Thread(
            target=master_background_worker,
            args=(max_pages,),
        )
        master_thread = new_worker
        new_worker.start()
        return {"message": "Started"}
@app.post("/api/admin/db/reset")
def api_system_reset():
    """Delete the database files and re-initialise storage.

    Removes the file at ``DB_PATH`` and ``sougouwiki.db`` inside
    ``BUCKET_DIR`` when they exist, then calls ``clear_progress()``,
    ``init_storage()`` and ``init_db()``.

    Raises:
        HTTPException: 500 carrying the error text if any step fails.
    """

    def _remove_if_present(path):
        # Missing files are simply skipped.
        if os.path.exists(path):
            os.remove(path)

    try:
        _remove_if_present(DB_PATH)
        _remove_if_present(os.path.join(BUCKET_DIR, "sougouwiki.db"))
        clear_progress()
        init_storage()
        init_db()
        return {"message": "Reset done"}
    except Exception as exc:
        raise HTTPException(500, str(exc))
@app.get("/api/actress")
def search_actress(q: str = Query("")):
    """Search actresses by name, falling back to aliases.

    An empty ``q`` yields ``{"actresses": []}``. Otherwise actresses whose
    name contains ``q`` are returned; if there are none, actresses owning an
    alias that contains ``q`` are returned instead. Each entry carries the
    actress's measurements, all of her aliases, her total video count and up
    to 20 videos ordered by ``release_date`` descending (missing dates sort
    as the empty string).
    """
    db = get_session()

    def _describe(actress):
        # Build the JSON entry for one actress.
        all_videos = (
            db.query(Video)
            .join(VideoActress)
            .filter(VideoActress.actress_id == actress.id)
            .all()
        )
        alias_rows = db.query(ActressAlias).filter_by(actress_id=actress.id).all()
        alias_names = [row.alias_name for row in alias_rows]
        newest_first = sorted(all_videos, key=lambda vid: vid.release_date or "", reverse=True)
        recent = [{"dvd_id": vid.dvd_id, "title": vid.title} for vid in newest_first[:20]]
        return {
            "id": actress.id, "name": actress.name, "name_kana": actress.name_kana,
            "height": actress.height, "bust": actress.bust,
            "waist": actress.waist, "hip": actress.hip,
            "aliases": alias_names,
            "video_count": len(all_videos),
            "videos": recent,
        }

    try:
        if not q:
            return {"actresses": []}
        matches = db.query(Actress).filter(Actress.name.like(f"%{q}%")).all()
        if not matches:
            alias_hits = (
                db.query(ActressAlias)
                .filter(ActressAlias.alias_name.like(f"%{q}%"))
                .all()
            )
            # De-duplicate actresses reachable through several aliases.
            matches = list({hit.actress for hit in alias_hits if hit.actress})
        entries = []
        for actress in matches:
            entries.append(_describe(actress))
        return {"actresses": entries}
    finally:
        db.close()
@app.get("/api/video")
def search_video(dvd_id: str = Query(""), q: str = Query("")):
    """Search videos by DVD id, falling back to title.

    The search term is ``dvd_id`` when given, else ``q``; with neither the
    result is ``{"videos": []}``. Videos whose ``dvd_id`` contains the term
    are returned; if there are none, videos whose title contains it are
    returned instead. Each entry includes the label name (or None) and the
    list of credited actresses with their role names.
    """
    db = get_session()
    try:
        term = dvd_id or q
        if not term:
            return {"videos": []}

        found = db.query(Video).filter(Video.dvd_id.like(f"%{term}%")).all()
        if not found:
            found = db.query(Video).filter(Video.title.like(f"%{term}%")).all()

        payload = []
        for video in found:
            cast_rows = (
                db.query(VideoActress, Actress)
                .join(Actress, VideoActress.actress_id == Actress.id)
                .filter(VideoActress.video_id == video.id)
                .all()
            )
            label_row = db.query(Label).filter_by(id=video.label_id).first()
            label_name = label_row.name if label_row else None
            cast = []
            for link, actress in cast_rows:
                cast.append({"id": actress.id, "name": actress.name, "role_name": link.role_name})
            payload.append({
                "dvd_id": video.dvd_id, "title": video.title, "release_date": video.release_date,
                "cover_url": video.cover_url, "dmm_url": video.dmm_url, "prefix": video.prefix,
                "label": label_name,
                "actresses": cast,
            })
        return {"videos": payload}
    finally:
        db.close()
|