Spaces:
Running
Running
Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -18,6 +18,7 @@ import aiosqlite
|
|
| 18 |
import trafilatura
|
| 19 |
from bs4 import BeautifulSoup
|
| 20 |
from duckduckgo_search import DDGS
|
|
|
|
| 21 |
from fastapi import FastAPI, Request
|
| 22 |
from fastapi.responses import JSONResponse, HTMLResponse
|
| 23 |
from huggingface_hub import AsyncInferenceClient
|
|
@@ -61,6 +62,9 @@ def get_system_prompt() -> str:
|
|
| 61 |
)
|
| 62 |
|
| 63 |
URL_PATTERN = re.compile(r'https?://[^\s<>"\']+')
|
|
|
|
|
|
|
|
|
|
| 64 |
SEARCH_TRIGGERS = ["κ²μ", "μ°Ύμ", "μμλ΄", "μλ €μ€", "μ‘°μ¬", "search", "look up", "find"]
|
| 65 |
|
| 66 |
|
|
@@ -269,12 +273,70 @@ async def fetch_url_content(url: str, max_chars: int = 3000) -> str:
|
|
| 269 |
return f"URL μ½κΈ° μ€ν¨: {str(e)[:100]}"
|
| 270 |
|
| 271 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
# ===== Message Routing & LLM =====
|
| 273 |
|
| 274 |
def detect_message_type(text: str) -> str:
|
| 275 |
"""Determine how to handle the message."""
|
| 276 |
if text.startswith("/"):
|
| 277 |
return "command"
|
|
|
|
|
|
|
| 278 |
if URL_PATTERN.search(text):
|
| 279 |
return "url"
|
| 280 |
if text.startswith("/search") or extract_search_query(text):
|
|
@@ -337,6 +399,7 @@ async def handle_command(chat_id: int, text: str) -> JSONResponse:
|
|
| 337 |
"κΈ°λ₯:\n"
|
| 338 |
"- λ©μμ§λ₯Ό 보λ΄λ©΄ AIκ° λ΅λ³\n"
|
| 339 |
"- URLμ 보λ΄λ©΄ μλ μμ½\n"
|
|
|
|
| 340 |
"- 'κ²μ' ν€μλ ν¬ν¨ μ μΉ κ²μ\n"
|
| 341 |
"- /search <κ²μμ΄> - μΉ κ²μ\n\n"
|
| 342 |
"μ€μ :\n"
|
|
@@ -367,7 +430,7 @@ async def handle_command(chat_id: int, text: str) -> JSONResponse:
|
|
| 367 |
f"μ΄ λ©μμ§: {stats['total_messages']}κ°\n"
|
| 368 |
f"첫 λν: {stats['first'] or 'μμ'}\n"
|
| 369 |
f"μ΅κ·Ό λν: {stats['last'] or 'μμ'}\n\n"
|
| 370 |
-
f"κΈ°λ₯: LLM λν, μΉ κ²μ, URL μμ½, μꡬ κΈ°μ΅")
|
| 371 |
|
| 372 |
if cmd == "/help":
|
| 373 |
return reply_msg(chat_id,
|
|
@@ -375,6 +438,7 @@ async def handle_command(chat_id: int, text: str) -> JSONResponse:
|
|
| 375 |
"λν: λ©μμ§λ₯Ό 보λ΄λ©΄ AIκ° λ΅λ³ν©λλ€.\n"
|
| 376 |
"κ²μ: 'λΉνΈμ½μΈ μμΈ κ²μν΄μ€' λλ /search λΉνΈμ½μΈ μμΈ\n"
|
| 377 |
"URL μμ½: URLμ 보λ΄λ©΄ μλμΌλ‘ λ΄μ© μμ½\n"
|
|
|
|
| 378 |
"κΈ°μ΅: λν λ΄μ©μ κΈ°μ΅ν©λλ€ (μλ² μ¬μμ νμλ μ μ§)\n\n"
|
| 379 |
"λͺ
λ Ήμ΄:\n"
|
| 380 |
"/search <κ²μμ΄> - μΉ κ²μ\n"
|
|
@@ -432,6 +496,49 @@ async def handle_url(chat_id: int, text: str) -> JSONResponse:
|
|
| 432 |
return reply_msg(chat_id, llm_response)
|
| 433 |
|
| 434 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 435 |
async def handle_chat(chat_id: int, text: str) -> JSONResponse:
|
| 436 |
"""Normal LLM chat with history."""
|
| 437 |
# Check for search triggers
|
|
@@ -492,19 +599,20 @@ async def root():
|
|
| 492 |
return HTMLResponse(
|
| 493 |
"<html><body style='font-family:system-ui;max-width:600px;margin:60px auto;padding:0 20px'>"
|
| 494 |
"<h1>Clawdbot AI Secretary</h1>"
|
| 495 |
-
"<p>Telegram bot + Web Search + URL Reader + Persistent Memory</p>"
|
| 496 |
"<p style='color:green;font-weight:bold'>ONLINE</p>"
|
| 497 |
"<h3>Features</h3><ul>"
|
| 498 |
"<li>LLM Chat (5 models)</li>"
|
| 499 |
"<li>DuckDuckGo Web Search</li>"
|
| 500 |
"<li>URL Content Summarizer</li>"
|
|
|
|
| 501 |
"<li>SQLite Persistent Memory</li>"
|
| 502 |
"</ul></body></html>")
|
| 503 |
|
| 504 |
|
| 505 |
@app.get("/health")
async def health():
    """Health-check endpoint: report an OK status and the feature names."""
    enabled_features = ["chat", "search", "url", "memory"]
    return {"status": "ok", "features": enabled_features}
|
| 508 |
|
| 509 |
|
| 510 |
@app.post("/webhook")
|
|
@@ -528,6 +636,8 @@ async def webhook(request: Request):
|
|
| 528 |
msg_type = detect_message_type(text)
|
| 529 |
if msg_type == "command":
|
| 530 |
return await handle_command(chat_id, text)
|
|
|
|
|
|
|
| 531 |
if msg_type == "url":
|
| 532 |
return await handle_url(chat_id, text)
|
| 533 |
return await handle_chat(chat_id, text)
|
|
|
|
| 18 |
import trafilatura
|
| 19 |
from bs4 import BeautifulSoup
|
| 20 |
from duckduckgo_search import DDGS
|
| 21 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
| 22 |
from fastapi import FastAPI, Request
|
| 23 |
from fastapi.responses import JSONResponse, HTMLResponse
|
| 24 |
from huggingface_hub import AsyncInferenceClient
|
|
|
|
| 62 |
)
|
| 63 |
|
| 64 |
URL_PATTERN = re.compile(r'https?://[^\s<>"\']+')
|
| 65 |
+
# Matches a YouTube video link and captures the 11-character video ID in
# group 1. The scheme and the www./m./music. host prefix are optional.
# Recognised forms: youtube.com/watch?...v=ID (v= may follow other query
# parameters), youtube.com/shorts/ID, youtube.com/embed/ID,
# youtube.com/live/ID and youtu.be/ID.
YOUTUBE_PATTERN = re.compile(
    r'(?:https?://)?(?:(?:www|m|music)\.)?'
    r'(?:youtube\.com/(?:watch\?(?:[^\s&#]+&)*v=|shorts/|embed/|live/)|youtu\.be/)'
    r'([a-zA-Z0-9_-]{11})'
)
|
| 68 |
SEARCH_TRIGGERS = ["κ²μ", "μ°Ύμ", "μμλ΄", "μλ €μ€", "μ‘°μ¬", "search", "look up", "find"]
|
| 69 |
|
| 70 |
|
|
|
|
| 273 |
return f"URL μ½κΈ° μ€ν¨: {str(e)[:100]}"
|
| 274 |
|
| 275 |
|
| 276 |
+
# ===== YouTube Transcript =====
|
| 277 |
+
|
| 278 |
+
def _fetch_transcript(video_id: str) -> str:
    """Fetch a YouTube transcript as newline-joined text (blocking call).

    Language preference is Korean, then English, then the first transcript
    the video lists. The fallback transcript is used in its own language;
    no translation is performed.

    Args:
        video_id: YouTube video ID to look up.

    Returns:
        The transcript text with one snippet per line, or "" when no
        transcript is available or any error occurs (the error is logged).
    """
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        transcript = None
        for lang_codes in (["ko"], ["en"]):
            try:
                transcript = transcript_list.find_transcript(lang_codes)
                break
            except Exception:
                # No transcript for this language; try the next preference.
                continue

        if transcript is None:
            # Last resort: whichever transcript is listed first, as-is.
            transcript = next(iter(transcript_list), None)

        if transcript is None:
            return ""

        # Snippets may be objects exposing `.text` or plain dicts with a
        # "text" key; accept either shape in a single pass.
        lines = []
        for snippet in transcript.fetch():
            if hasattr(snippet, "text"):
                lines.append(snippet.text)
            elif isinstance(snippet, dict) and "text" in snippet:
                lines.append(snippet["text"])
        return "\n".join(lines)
    except Exception as e:
        # Best-effort: callers treat "" as "no transcript available".
        logger.error("YouTube transcript error for %s: %s", video_id, e)
        return ""
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
async def fetch_youtube_transcript(video_id: str, max_chars: int = 4000) -> str:
    """Fetch a YouTube transcript without blocking the event loop.

    Runs the blocking `_fetch_transcript` in the loop's default executor and
    truncates the result to `max_chars` characters.

    Args:
        video_id: YouTube video ID to look up.
        max_chars: Maximum number of transcript characters to keep; longer
            transcripts are cut and a truncation marker is appended.

    Returns:
        The (possibly truncated) transcript text, or "" when none was found.
    """
    # Inside a coroutine the running loop is the one to use;
    # get_event_loop() is the legacy spelling for this.
    loop = asyncio.get_running_loop()
    text = await loop.run_in_executor(None, _fetch_transcript, video_id)

    if not text:
        return ""

    if len(text) > max_chars:
        text = text[:max_chars] + "\n...(μ΄ν μλ΅)"

    return text
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
def extract_youtube_id(text: str) -> str | None:
    """Return the video ID of the first YouTube link found in `text`.

    The ID is capture group 1 of YOUTUBE_PATTERN; None is returned when the
    pattern does not match anywhere in the text.
    """
    found = YOUTUBE_PATTERN.search(text)
    if found is None:
        return None
    return found.group(1)
|
| 330 |
+
|
| 331 |
+
|
| 332 |
# ===== Message Routing & LLM =====
|
| 333 |
|
| 334 |
def detect_message_type(text: str) -> str:
|
| 335 |
"""Determine how to handle the message."""
|
| 336 |
if text.startswith("/"):
|
| 337 |
return "command"
|
| 338 |
+
if extract_youtube_id(text):
|
| 339 |
+
return "youtube"
|
| 340 |
if URL_PATTERN.search(text):
|
| 341 |
return "url"
|
| 342 |
if text.startswith("/search") or extract_search_query(text):
|
|
|
|
| 399 |
"κΈ°λ₯:\n"
|
| 400 |
"- λ©μμ§λ₯Ό 보λ΄λ©΄ AIκ° λ΅λ³\n"
|
| 401 |
"- URLμ 보λ΄λ©΄ μλ μμ½\n"
|
| 402 |
+
"- YouTube URLμ 보λ΄λ©΄ μμ μμ½\n"
|
| 403 |
"- 'κ²μ' ν€μλ ν¬ν¨ μ μΉ κ²μ\n"
|
| 404 |
"- /search <κ²μμ΄> - μΉ κ²μ\n\n"
|
| 405 |
"μ€μ :\n"
|
|
|
|
| 430 |
f"μ΄ λ©μμ§: {stats['total_messages']}κ°\n"
|
| 431 |
f"첫 λν: {stats['first'] or 'μμ'}\n"
|
| 432 |
f"μ΅κ·Ό λν: {stats['last'] or 'μμ'}\n\n"
|
| 433 |
+
f"κΈ°λ₯: LLM λν, μΉ κ²μ, URL μμ½, YouTube μμ½, μꡬ κΈ°μ΅")
|
| 434 |
|
| 435 |
if cmd == "/help":
|
| 436 |
return reply_msg(chat_id,
|
|
|
|
| 438 |
"λν: λ©μμ§λ₯Ό 보λ΄λ©΄ AIκ° λ΅λ³ν©λλ€.\n"
|
| 439 |
"κ²μ: 'λΉνΈμ½μΈ μμΈ κ²μν΄μ€' λλ /search λΉνΈμ½μΈ μμΈ\n"
|
| 440 |
"URL μμ½: URLμ 보λ΄λ©΄ μλμΌλ‘ λ΄μ© μμ½\n"
|
| 441 |
+
"YouTube μμ½: YouTube URLμ 보λ΄λ©΄ μλ§ κΈ°λ° μμ μμ½\n"
|
| 442 |
"κΈ°μ΅: λν λ΄μ©μ κΈ°μ΅ν©λλ€ (μλ² μ¬μμ νμλ μ μ§)\n\n"
|
| 443 |
"λͺ
λ Ήμ΄:\n"
|
| 444 |
"/search <κ²μμ΄> - μΉ κ²μ\n"
|
|
|
|
| 496 |
return reply_msg(chat_id, llm_response)
|
| 497 |
|
| 498 |
|
| 499 |
+
async def handle_youtube(chat_id: int, text: str) -> JSONResponse:
    """Summarize a YouTube video, or answer the user's request about it.

    Flow:
      1. Extract the video ID; reply with an error if there is none.
      2. Save the user message and fetch the transcript.
      3. Without a transcript, fall back to summarizing the watch page;
         if that also yields nothing usable, tell the user that the video
         cannot be summarized.
      4. With a transcript, answer the text that remains after removing the
         links, or produce a structured summary when nothing else was sent.

    Args:
        chat_id: Chat the reply is addressed to.
        text: Raw incoming message containing a YouTube link.

    Returns:
        The response built by `reply_msg`.
    """
    video_id = extract_youtube_id(text)
    if not video_id:
        return reply_msg(chat_id, "YouTube URLμ μΈμν μ μμ΅λλ€.")

    await save_message(chat_id, "user", text)
    transcript = await fetch_youtube_transcript(video_id)

    if not transcript:
        # Fallback: read the watch page like a regular URL. Build the URL
        # from the video ID instead of taking the first URL in the message,
        # which may be a different site or missing (scheme-less links).
        watch_url = f"https://www.youtube.com/watch?v={video_id}"
        content = await fetch_url_content(watch_url)
        # fetch_url_content reports failures as text; these markers are
        # presumably its error messages - verify against that function.
        if content and "μ€ν¨" not in content and "μΆμΆν μ μ" not in content:
            prompt = "μλ YouTube νμ΄μ§ λ΄μ©μ λ°νμΌλ‘ μμμ ν΅μ¬ λ΄μ©μ μμ½ν΄μ€."
            llm_response = await generate_llm_response(chat_id, prompt, extra_context=content)
            await save_message(chat_id, "assistant", llm_response)
            return reply_msg(chat_id, llm_response)
        return reply_msg(chat_id,
            "μ΄ μμμ μλ§(CC)μ΄ μμ΄ μμ½ν μ μμ΅λλ€.\n"
            "μλ§μ΄ μλ μμλ§ μμ½ κ°λ₯ν©λλ€.")

    # Determine user intent: whatever is left once the links are removed.
    clean_text = URL_PATTERN.sub("", text).strip()
    # URL_PATTERN only removes http(s) URLs; also drop YouTube links that
    # were written without a scheme so they are not mistaken for a request.
    clean_text = re.sub(
        r'(?:https?://)?(?:[\w-]+\.)*(?:youtube\.com|youtu\.be)/[^\s<>"\']*',
        '',
        clean_text,
    ).strip()
    if clean_text:
        prompt = f"{clean_text}\n\nμ μμ²μ λν΄ μλ YouTube μμ μλ§μ μ°Έκ³ ν΄μ λ΅λ³ν΄μ€."
    else:
        prompt = (
            "μλ YouTube μμ μλ§μ λΆμνμ¬ ν΅μ¬ μμ½ν΄μ€.\n"
            "κ·μΉ:\n"
            "1. μμμ μ£Όμ μ ν΅μ¬ λ©μμ§λ₯Ό λ¨Όμ  ν μ€λ‘ μ 리\n"
            "2. μ£Όμ λ΄μ©μ 3~5κ° ν¬μΈνΈλ‘ μμ½\n"
            "3. ν΅μ¬ μΈμ¬μ΄νΈλ κ²°λ‘ μ΄ μμΌλ©΄ λ§μ§λ§μ μ 리\n"
            "4. μλ§ μλ¬Έμ κ·Έλλ‘ μΈμ©νμ§ λ§κ³  ν΅μ¬λ§ μ 리"
        )

    llm_response = await generate_llm_response(chat_id, prompt, extra_context=transcript)
    await save_message(chat_id, "assistant", llm_response)
    return reply_msg(chat_id, f"π¬ YouTube μμ½\n\n{llm_response}")
|
| 540 |
+
|
| 541 |
+
|
| 542 |
async def handle_chat(chat_id: int, text: str) -> JSONResponse:
|
| 543 |
"""Normal LLM chat with history."""
|
| 544 |
# Check for search triggers
|
|
|
|
| 599 |
return HTMLResponse(
|
| 600 |
"<html><body style='font-family:system-ui;max-width:600px;margin:60px auto;padding:0 20px'>"
|
| 601 |
"<h1>Clawdbot AI Secretary</h1>"
|
| 602 |
+
"<p>Telegram bot + Web Search + URL Reader + YouTube Summary + Persistent Memory</p>"
|
| 603 |
"<p style='color:green;font-weight:bold'>ONLINE</p>"
|
| 604 |
"<h3>Features</h3><ul>"
|
| 605 |
"<li>LLM Chat (5 models)</li>"
|
| 606 |
"<li>DuckDuckGo Web Search</li>"
|
| 607 |
"<li>URL Content Summarizer</li>"
|
| 608 |
+
"<li>YouTube Video Summarizer</li>"
|
| 609 |
"<li>SQLite Persistent Memory</li>"
|
| 610 |
"</ul></body></html>")
|
| 611 |
|
| 612 |
|
| 613 |
@app.get("/health")
async def health():
    """Health-check endpoint: report an OK status and the feature names."""
    enabled_features = ["chat", "search", "url", "youtube", "memory"]
    return {"status": "ok", "features": enabled_features}
|
| 616 |
|
| 617 |
|
| 618 |
@app.post("/webhook")
|
|
|
|
| 636 |
msg_type = detect_message_type(text)
|
| 637 |
if msg_type == "command":
|
| 638 |
return await handle_command(chat_id, text)
|
| 639 |
+
if msg_type == "youtube":
|
| 640 |
+
return await handle_youtube(chat_id, text)
|
| 641 |
if msg_type == "url":
|
| 642 |
return await handle_url(chat_id, text)
|
| 643 |
return await handle_chat(chat_id, text)
|