File size: 13,278 Bytes
6ca330f
 
a9f264c
77aa3fc
5c14c39
f07ac7d
30fbe48
 
6ca330f
9fb0308
30fbe48
 
77aa3fc
373a3a1
6ca330f
fb73ae4
77aa3fc
2685f20
30fbe48
 
 
e0016b4
2685f20
30fbe48
 
373a3a1
30fbe48
cbec340
 
8f79955
83255e4
 
 
7a1f249
8f79955
7a1f249
83255e4
 
7a1f249
8f79955
7a1f249
 
 
 
83255e4
 
 
 
 
 
 
 
 
7a1f249
 
 
 
 
 
 
 
 
 
 
 
 
83255e4
 
 
 
 
8f79955
7a1f249
 
8f79955
83255e4
 
 
 
 
 
 
 
 
 
e79a50f
 
 
b7cc25d
 
83255e4
e79a50f
83255e4
 
e79a50f
83255e4
 
 
 
e79a50f
83255e4
b7cc25d
 
 
 
83255e4
 
 
 
 
 
e79a50f
 
 
 
 
83255e4
8f79955
b7cc25d
 
 
 
 
 
83255e4
 
b7cc25d
 
 
83255e4
 
b7cc25d
 
 
 
 
2e55c9e
83255e4
 
 
b7cc25d
 
83255e4
 
e79a50f
 
83255e4
 
7a1f249
cbec340
83255e4
c81d689
e79a50f
 
 
 
 
 
 
cbec340
ac713be
 
324a66b
 
ac713be
 
 
324a66b
 
 
c81d689
30fbe48
f0773fc
 
83255e4
e0016b4
f0773fc
 
e0016b4
f0773fc
e0016b4
30fbe48
 
 
e0016b4
30fbe48
 
f0773fc
30fbe48
 
 
e0016b4
30fbe48
 
 
 
5c14c39
32575f3
 
 
 
 
 
 
 
 
 
 
30fbe48
 
9fb0308
e0016b4
 
 
 
 
 
 
 
9fb0308
e0016b4
 
 
 
 
 
30fbe48
e0016b4
 
 
9fb0308
 
2685f20
6ca330f
30fbe48
 
77aa3fc
 
30fbe48
e0016b4
 
 
 
 
77aa3fc
 
e0016b4
 
7890908
 
30fbe48
 
 
e0016b4
30fbe48
 
e0016b4
77aa3fc
 
30fbe48
77aa3fc
5c14c39
 
 
8d9ea82
5c14c39
77aa3fc
e0016b4
 
8d9ea82
77aa3fc
e0016b4
77aa3fc
 
30fbe48
e0016b4
 
30fbe48
e0016b4
cb83cd2
 
 
 
 
 
 
 
 
 
e0016b4
 
cb83cd2
 
 
 
 
e0016b4
77aa3fc
 
e0016b4
77aa3fc
 
30fbe48
 
 
e0016b4
cb83cd2
 
 
 
 
 
e0016b4
 
cb83cd2
 
 
 
e0016b4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
import os
import threading
import requests
import shutil
import time
import re
import logging
from typing import List, Optional
from fastapi import FastAPI, Query, BackgroundTasks, HTTPException
from fastapi.responses import RedirectResponse, HTMLResponse
from pydantic import BaseModel

from db.database import get_session, init_db, DB_PATH, engine
from db.config import sync_to_bucket, BUCKET_DIR, LOCAL_DIR, init_storage
from db.models import Actress, ActressAlias, Video, VideoActress, Label, CrawlStatus
from crawler.crawl import run_crawl, stop_crawl
from crawler.cache import load_progress, clear_progress

# Process-wide logging: INFO level, each record prefixed with timestamp,
# logger name and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("sougouwiki-api")

# The FastAPI application; the route handlers in this module register on it.
app = FastAPI(title="Sougouwiki Professional API", version="1.1.0")

# master_thread is the handle of the background crawl worker, or None when no
# worker has been started or the last one has finished (the worker resets it
# on exit). Assignments to it are made while holding worker_lock.
worker_lock = threading.Lock()
master_thread: Optional[threading.Thread] = None

def backfill_labels_task():
    """Run one label-backfill pass over at most 5000 unlabeled videos.

    Loads up to 5000 ``Video`` rows whose ``label_id`` is NULL, takes the
    first parenthesised segment of each title (ASCII or full-width
    parentheses), splits it on ``/`` and assigns the LAST piece whose text
    exactly equals a ``Label.name``. All assignments are committed together.

    Videos that cannot be matched keep ``label_id`` NULL, which is why this
    function processes a single chunk only: re-querying would return the same
    unmatched rows again. Any exception is logged and rolled back rather than
    raised; the session is always closed.
    """
    session = get_session()
    try:
        logger.info("Starting intelligent label backfill...")
        chunk_limit = 5000
        # `== None` is intentional: SQLAlchemy renders it as `IS NULL`.
        pending = session.query(Video).filter(Video.label_id == None).limit(chunk_limit).all()  # noqa: E711

        if pending:
            for video in pending:
                if not video.title:
                    continue

                found = re.search(r"[\uff08(](.+?)[\uff09)]", video.title)
                if not found:
                    # Title has no parenthesised segment; leave it unlabeled.
                    continue

                bracket_text = found.group(1).strip()
                candidates = [piece.strip() for piece in re.split(r"[//]", bracket_text)]

                chosen = None
                for candidate in candidates:
                    hit = session.query(Label).filter_by(name=candidate).first()
                    if hit:
                        # No break: a later matching piece overrides an
                        # earlier one.
                        chosen = hit

                if chosen:
                    video.label_id = chosen.id

            session.commit()
    except Exception as exc:
        logger.error(f"Backfill error: {exc}")
        session.rollback()
    finally:
        session.close()

# Global state for monitoring the full-table label backfill.
#   is_running    - True while improved_backfill_task() is executing
#   last_id       - id of the most recently scanned Video row
#   total_fixed   - number of videos that were assigned a label
#   total_scanned - number of Video rows visited so far
# Served as-is by the /api/admin/backfill/status endpoint.
backfill_status = {"is_running": False, "last_id": 0, "total_fixed": 0, "total_scanned": 0}

from sqlalchemy import func

def improved_backfill_task():
    """Scan the whole ``Video`` table and link unlabeled videos to a ``Label``.

    Rows are visited in ascending ``id`` order in batches of 5000, always
    continuing after the last id seen, so videos that cannot be matched are
    visited exactly once. For every video with ``label_id`` None and a
    non-empty title, the first bracket-delimited segment of the title is split
    on ``/``, the full-width slash (U+FF0F) and ``|``; the first non-empty
    piece whose lower-cased text equals a lower-cased ``Label.name`` is
    assigned to the video.

    Side effects:
        * ``backfill_status`` is reset at the start and updated while scanning.
        * The session is committed after every batch.
        * ``sync_to_bucket()`` is called once at the end if anything was fixed.

    Exceptions are logged (with traceback) and rolled back, never raised.
    """
    # Deliberately permissive: any bracket character may open or close the
    # segment. The former second "fallback" pattern was dropped: its character
    # class was corrupted, and every title it could match with non-empty
    # content is already matched by this pattern.
    bracket_re = re.compile(
        r"[\uff08\uff09\(\)\[\]\u3010\u3011](.+?)[\uff08\uff09\(\)\[\]\u3010\u3011]"
    )
    # The original class listed "/" twice; presumably one of them was the
    # full-width slash, so it is accepted explicitly here.
    separator_re = re.compile(r"[/\uff0f|]")
    batch_size = 5000

    session = get_session()
    try:
        # Reset every counter so a rerun never reports totals left over from
        # a previous run (total_fixed is only written when a fix happens).
        backfill_status.update(is_running=True, last_id=0, total_fixed=0, total_scanned=0)
        logger.info("Starting improved full-table label scan...")
        last_id = 0
        total_fixed = 0
        total_scanned = 0

        # Diagnostic: print a few labels to log
        sample_labels = session.query(Label).limit(5).all()
        logger.info(f"DB Label samples: {[l.name for l in sample_labels]}")

        while True:
            videos = (
                session.query(Video)
                .filter(Video.id > last_id)
                .order_by(Video.id)
                .limit(batch_size)
                .all()
            )
            if not videos:
                break

            for video in videos:
                last_id = video.id
                total_scanned += 1
                backfill_status["last_id"] = last_id
                backfill_status["total_scanned"] = total_scanned

                if video.label_id is not None or not video.title:
                    continue

                match = bracket_re.search(video.title)
                if not match:
                    continue

                for part in separator_re.split(match.group(1).strip()):
                    part = part.strip()
                    if not part:
                        continue
                    # Case-insensitive comparison, done by the database.
                    label = (
                        session.query(Label)
                        .filter(func.lower(Label.name) == func.lower(part))
                        .first()
                    )
                    if label:
                        # Stop at the FIRST valid label (manufacturer-first).
                        video.label_id = label.id
                        total_fixed += 1
                        backfill_status["total_fixed"] = total_fixed
                        break

            session.commit()
            if total_scanned % 10000 == 0:
                logger.info(f"Scan progress: Scanned {total_scanned}, Fixed {total_fixed}")

        logger.info(f"Full scan finished. Total fixed: {total_fixed}")
        if total_fixed > 0:
            sync_to_bucket()
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Improved backfill error: {e}")
        session.rollback()
    finally:
        backfill_status["is_running"] = False
        session.close()

@app.get("/api/admin/backfill/status", include_in_schema=False)
def get_backfill_status():
    """Expose the live ``backfill_status`` dictionary.

    The module-level dict itself is returned (not a copy), so the response
    reflects whatever the backfill task has written at the time of the call.
    """
    return backfill_status

@app.get("/api/admin/labels/search", include_in_schema=False)
def search_label_samples(q: str = ""):
    """List up to 50 labels, optionally filtered by a name substring.

    Args:
        q: Substring matched with SQL ``LIKE '%q%'`` against ``Label.name``.
            An empty string disables the filter.

    Returns:
        A list of ``{"id": ..., "name": ...}`` dictionaries.
    """
    session = get_session()
    try:
        label_query = session.query(Label)
        if q:
            pattern = f"%{q}%"
            label_query = label_query.filter(Label.name.like(pattern))
        rows = label_query.limit(50).all()
        return [{"id": row.id, "name": row.name} for row in rows]
    finally:
        session.close()

def master_background_worker(max_pages: int):
    """Body of the background worker thread.

    Runs, in order: ``init_db()``, the full-table label backfill, the crawler
    for ``max_pages`` pages, and a final ``sync_to_bucket()``. Any exception
    stops the sequence and is logged instead of propagating. On exit the
    global ``master_thread`` handle is cleared under ``worker_lock``.

    Args:
        max_pages: Passed through to ``run_crawl(max_pages=...)``.
    """
    global master_thread
    try:
        init_db()
        improved_backfill_task()
        logger.info(f"Starting crawler for {max_pages} pages...")
        run_crawl(max_pages=max_pages)
        sync_to_bucket()
    except Exception as exc:
        logger.error(f"Worker error: {exc}")
    finally:
        # Clear the handle whether the run succeeded or failed.
        with worker_lock:
            master_thread = None

def sync_heartbeat():
    """Call ``sync_to_bucket()`` every 120 seconds, forever.

    Meant to be the target of a background thread; it never returns. A failed
    sync is logged with its traceback and the loop keeps going, so one
    transient error does not silently end periodic syncing for the rest of
    the process lifetime.
    """
    while True:
        time.sleep(120)
        try:
            sync_to_bucket()
        except Exception:
            logger.exception("Periodic bucket sync failed; retrying on the next heartbeat")

@app.on_event("startup")
async def startup_event():
    """Application startup hook.

    Initialises storage, launches the daemon heartbeat thread that
    periodically syncs to the bucket, and starts the background worker with
    a 342-page crawl unless a worker thread is already alive.
    """
    global master_thread
    init_storage()

    heartbeat = threading.Thread(target=sync_heartbeat, daemon=True)
    heartbeat.start()

    with worker_lock:
        if master_thread is not None and master_thread.is_alive():
            return
        master_thread = threading.Thread(target=master_background_worker, args=(342,))
        master_thread.start()

@app.get("/api/admin/db/export", include_in_schema=False)
def export_database():
    """Serve the current production database file as a download.

    Returns:
        A ``FileResponse`` for the file at ``DB_PATH``, offered under the
        download name ``sougouwiki_production.db``.

    Raises:
        HTTPException: 404 when no file exists at ``DB_PATH``.
    """
    # FileResponse is not part of the module-level fastapi.responses import,
    # so the original handler raised NameError whenever the file existed.
    from fastapi.responses import FileResponse

    if not os.path.exists(DB_PATH):
        raise HTTPException(404, "Database file not found")
    return FileResponse(
        DB_PATH,
        filename="sougouwiki_production.db",
        media_type="application/x-sqlite3",
    )

@app.get("/", response_class=HTMLResponse, include_in_schema=False)
def root_ui():
    """Return the static HTML dashboard page.

    The page polls ``/api/admin/crawl/status`` every 5 seconds to show the
    running/idle state and the raw status JSON, and its button sends a POST
    to ``/api/admin/crawl/start``.
    """
    page = """
    <html>
        <head><title>Sougouwiki Hub</title></head>
        <body style="font-family:sans-serif; padding:40px; background:#f6f8fa;">
            <h1>Sougouwiki Manager</h1>
            <div style="background:white; border:1px solid #e1e4e8; padding:20px; border-radius:6px;">
                <p id="st">Checking status...</p>
                <button onclick="fetch('/api/admin/crawl/start', {method:'POST'})">Restart</button>
                <div id="stats" style="margin-top:20px;"><pre>Loading...</pre></div>
            </div>
            <script>
                async function update() {
                    const r = await fetch('/api/admin/crawl/status');
                    const d = await r.json();
                    document.getElementById('st').innerText = d.is_running ? '🟢 Running' : '💤 Idle';
                    document.getElementById('stats').innerHTML = `<pre>${JSON.stringify(d, null, 2)}</pre>`;
                }
                setInterval(update, 5000); update();
            </script>
        </body>
    </html>
    """
    return page

@app.get("/api/admin/crawl/status")
def get_system_status():
    """Report worker liveness, crawl progress and database row counts.

    Returns:
        On success a dict with:
            * ``is_running`` - whether the background worker thread is alive
            * ``progress``   - the value returned by ``load_progress()``
            * ``db_stats``   - row counts for crawl pages, actresses, videos
        If gathering any of these fails, ``{"error": "<message>"}`` is
        returned instead (the failure is also logged).
    """
    # Read the global once: the worker resets master_thread to None when it
    # finishes, so checking and then calling is_alive() on the global could
    # hit None in between.
    worker = master_thread
    session = get_session()
    try:
        return {
            "is_running": worker is not None and worker.is_alive(),
            "progress": load_progress(),
            "db_stats": {
                "pages": session.query(CrawlStatus).count(),
                "actresses": session.query(Actress).count(),
                "videos": session.query(Video).count(),
            },
        }
    except Exception as e:
        # Keep the best-effort response shape, but leave a trace in the log.
        logger.exception("Failed to collect system status")
        return {"error": str(e)}
    finally:
        session.close()

@app.post("/api/admin/crawl/start")
def api_manual_start(max_pages: int = 342):
    """Start the background worker unless one is already alive.

    Args:
        max_pages: Page limit handed to the worker (default 342).

    Returns:
        ``{"message": "Already running"}`` when a live worker exists,
        otherwise ``{"message": "Started"}`` after launching a new thread.
    """
    global master_thread
    with worker_lock:
        if master_thread is not None and master_thread.is_alive():
            return {"message": "Already running"}
        master_thread = threading.Thread(
            target=master_background_worker,
            args=(max_pages,),
        )
        master_thread.start()
    return {"message": "Started"}

@app.post("/api/admin/db/reset")
def api_system_reset():
    """Delete the local and bucket database files and rebuild an empty DB.

    Steps: dispose the engine's pooled connections, remove the file at
    ``DB_PATH`` and ``sougouwiki.db`` inside ``BUCKET_DIR`` if they exist,
    clear crawl progress, then re-run ``init_storage()`` and ``init_db()``.

    Returns:
        ``{"message": "Reset done"}`` on success.

    Raises:
        HTTPException: 500 carrying the error text if any step fails.
    """
    try:
        # Drop pooled connections before unlinking the file; a connection
        # kept open would keep using the deleted file instead of the new one.
        # Assumes `engine` is the SQLAlchemy engine behind get_session().
        engine.dispose()

        if os.path.exists(DB_PATH):
            os.remove(DB_PATH)
        bucket_db = os.path.join(BUCKET_DIR, "sougouwiki.db")
        if os.path.exists(bucket_db):
            os.remove(bucket_db)

        clear_progress()
        init_storage()
        init_db()
        return {"message": "Reset done"}
    except Exception as e:
        raise HTTPException(500, str(e)) from e

@app.get("/api/actress")
def search_actress(q: str = Query("")):
    """Search actresses by name substring, falling back to alias substring.

    Args:
        q: Substring matched with SQL ``LIKE '%q%'`` against
            ``Actress.name``. Aliases are searched only when no name
            matches. An empty value yields an empty result.

    Returns:
        ``{"actresses": [...]}`` where each entry carries the profile fields,
        every alias name, the total ``video_count`` and up to 20 videos
        ordered by ``release_date`` descending.
    """
    session = get_session()
    try:
        if not q:
            return {"actresses": []}

        matches = session.query(Actress).filter(Actress.name.like(f"%{q}%")).all()
        if not matches:
            aliases = (
                session.query(ActressAlias)
                .filter(ActressAlias.alias_name.like(f"%{q}%"))
                .all()
            )
            # dict.fromkeys de-duplicates while keeping first-seen order;
            # list(set(...)) returned the same actresses in an order that
            # is not stable from one call to the next.
            matches = list(dict.fromkeys(al.actress for al in aliases if al.actress))

        results = []
        for a in matches:
            videos = (
                session.query(Video)
                .join(VideoActress)
                .filter(VideoActress.actress_id == a.id)
                .all()
            )
            alias_rows = session.query(ActressAlias).filter_by(actress_id=a.id).all()
            # Missing release dates sort as "" so they land last in descending order.
            newest_first = sorted(videos, key=lambda x: x.release_date or "", reverse=True)

            results.append({
                "id": a.id, "name": a.name, "name_kana": a.name_kana,
                "height": a.height, "bust": a.bust, "waist": a.waist, "hip": a.hip,
                "aliases": [al.alias_name for al in alias_rows],
                "video_count": len(videos),
                "videos": [{"dvd_id": v.dvd_id, "title": v.title} for v in newest_first[:20]],
            })
        return {"actresses": results}
    finally:
        session.close()

@app.get("/api/video")
def search_video(dvd_id: str = Query(""), q: str = Query("")):
    """Search videos by DVD id substring, falling back to title substring.

    Args:
        dvd_id: Preferred search text; used when non-empty.
        q: Search text used when ``dvd_id`` is empty.

    Returns:
        ``{"videos": [...]}``; each entry holds the video fields, the label
        name (or None) and the credited actresses with their role names.
        With no search text the list is empty.
    """
    session = get_session()
    try:
        needle = dvd_id or q
        if not needle:
            return {"videos": []}

        pattern = f"%{needle}%"
        found = session.query(Video).filter(Video.dvd_id.like(pattern)).all()
        if not found:
            # Nothing matched on the id, so try the title instead.
            found = session.query(Video).filter(Video.title.like(pattern)).all()

        payload = []
        for video in found:
            cast_rows = (
                session.query(VideoActress, Actress)
                .join(Actress, VideoActress.actress_id == Actress.id)
                .filter(VideoActress.video_id == video.id)
                .all()
            )
            label_row = session.query(Label).filter_by(id=video.label_id).first()
            label_name = label_row.name if label_row else None
            cast = [
                {"id": actress.id, "name": actress.name, "role_name": link.role_name}
                for link, actress in cast_rows
            ]
            payload.append({
                "dvd_id": video.dvd_id,
                "title": video.title,
                "release_date": video.release_date,
                "cover_url": video.cover_url,
                "dmm_url": video.dmm_url,
                "prefix": video.prefix,
                "label": label_name,
                "actresses": cast,
            })
        return {"videos": payload}
    finally:
        session.close()