Spaces:
Paused
Paused
File size: 6,470 Bytes
40ee4e3 446d594 40ee4e3 9caf1de 40ee4e3 607e69d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 | import os
import re as _re
import json
import asyncio as _asyncio
import aiosqlite
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
FAL_KEY = os.environ.get("FAL_KEY", "")
HF_TOKEN = os.environ.get("HF_TOKEN", "")
ADMIN_EMAIL = "arxivgpt@gmail.com"
SA_BACKUP_REPO = "ginigen-ai/siteagent-db"
DATASET_ID = "ginigen-ai/siteagent"
CLIP_BUCKET = "ginigen-ai/siteagent"
CLIP_MAX_BYTES = 1 * 1024 * 1024 * 1024
NAVER_CLIENT_ID = os.environ.get("NAVER_CLIENT_ID", "")
NAVER_CLIENT_SECRET = os.environ.get("NAVER_CLIENT_SECRET", "")
GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"
def _get_db_path():
for p in ["/data/siteagent.db", "./siteagent.db"]:
d = os.path.dirname(p)
if d and os.path.isdir(d):
return p
if not d:
return p
return "./siteagent.db"
SA_DB_PATH = _get_db_path()
_db_lock = None
def _get_lock():
global _db_lock
if _db_lock is None: _db_lock = _asyncio.Lock()
return _db_lock
_write_count = 0
_write_queue = None
async def _init_write_queue():
global _write_queue
if _write_queue is None:
_write_queue = _asyncio.Queue(maxsize=500)
_asyncio.create_task(_db_write_worker())
async def _db_write_worker():
while True:
try:
sql, params = await _write_queue.get()
try:
if params:
params = tuple(_sanitize_text(p) if isinstance(p, str) else p for p in params)
async with aiosqlite.connect(SA_DB_PATH, timeout=10.0) as db:
await db.execute("PRAGMA journal_mode=WAL")
await db.execute(sql, params or ())
await db.commit()
global _write_count
_write_count += 1
if _write_count % 100 == 0:
_asyncio.create_task(_sa_backup_db())
except Exception as e:
print(f"[db-worker] {e}")
finally:
_write_queue.task_done()
except _asyncio.CancelledError:
break
except Exception as e:
print(f"[db-worker-fatal] {e}")
await _asyncio.sleep(0.1)
def _db_enqueue(sql, params=None):
if _write_queue is None: return
try:
_write_queue.put_nowait((sql, params))
except _asyncio.QueueFull:
try:
_write_queue.get_nowait()
_write_queue.put_nowait((sql, params))
except: pass
async def _db_write(sql, params=None):
_db_enqueue(sql, params)
async def _db_write_sync(sql, params=None):
async with aiosqlite.connect(SA_DB_PATH, timeout=10.0) as db:
await db.execute("PRAGMA journal_mode=WAL")
cursor = await db.execute(sql, params or ())
await db.commit()
return cursor
async def _db_read(sql, params=None):
async with aiosqlite.connect(SA_DB_PATH, timeout=10.0) as db:
db.row_factory = aiosqlite.Row
cursor = await db.execute(sql, params or ())
return await cursor.fetchall()
async def _db_read_one(sql, params=None):
async with aiosqlite.connect(SA_DB_PATH, timeout=10.0) as db:
db.row_factory = aiosqlite.Row
cursor = await db.execute(sql, params or ())
return await cursor.fetchone()
async def _sa_backup_db():
if not HF_TOKEN or not os.path.exists(SA_DB_PATH): return
try:
import sqlite3
conn = sqlite3.connect(SA_DB_PATH)
result = conn.execute("PRAGMA integrity_check").fetchone()
conn.close()
if result[0] != "ok":
print(f"⚠️ DB integrity issue: {result[0]}")
return
from datetime import datetime, timezone
ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
from huggingface_hub import HfApi
api = HfApi(token=HF_TOKEN)
try: api.create_repo(repo_id=SA_BACKUP_REPO, repo_type="dataset", private=True, exist_ok=True)
except: pass
api.upload_file(path_or_fileobj=SA_DB_PATH, path_in_repo=f"backup/siteagent_{ts}.db", repo_id=SA_BACKUP_REPO, repo_type="dataset")
api.upload_file(path_or_fileobj=SA_DB_PATH, path_in_repo="latest/siteagent.db", repo_id=SA_BACKUP_REPO, repo_type="dataset")
print(f"✅ [SA-backup] Hub backup: {ts}")
except Exception as e:
print(f"⚠️ [SA-backup] {e}")
def _sanitize_text(text):
if not isinstance(text, str):
return text
clean = _re.sub(r'[\ud800-\udfff]', '', text)
try:
clean.encode('utf-8')
return clean
except UnicodeEncodeError:
return clean.encode('utf-8', 'replace').decode('utf-8')
def _sanitize_messages(messages):
if not messages:
return messages
clean = []
for msg in messages:
m = dict(msg)
if isinstance(m.get("content"), str):
m["content"] = _sanitize_text(m["content"])
elif isinstance(m.get("content"), list):
m["content"] = [
{**item, "text": _sanitize_text(item["text"])} if isinstance(item, dict) and "text" in item else item
for item in m["content"]
]
clean.append(m)
return clean
def _flush_table(tbl, out):
if len(tbl) < 2:
out.extend(tbl)
return
headers = [c.strip() for c in tbl[0].strip('|').split('|')]
for i, tr in enumerate(tbl):
cells = [c.strip() for c in tr.strip('|').split('|')]
if i == 0:
out.append('**' + ' · '.join(c for c in cells if c) + '**')
elif i == 1 and _re.match(r'^[\s:\-|]+$', tr):
continue
else:
parts = []
for j, cell in enumerate(cells):
if cell:
if j < len(headers) and headers[j] and headers[j] != cell:
parts.append(f'**{headers[j]}**: {cell}')
else:
parts.append(cell)
out.append('• ' + ' / '.join(parts))
def _strip_md_table(text):
if not isinstance(text, str) or '|' not in text:
return text
lines = text.split('\n')
out = []
tbl = []
in_tbl = False
for line in lines:
if _re.match(r'^\s*\|.+\|\s*$', line):
in_tbl = True
tbl.append(line)
else:
if in_tbl:
_flush_table(tbl, out)
tbl = []
in_tbl = False
out.append(line)
if in_tbl:
_flush_table(tbl, out)
return '\n'.join(out) |