Daviidkang committed on
Commit
958264d
·
verified ·
1 Parent(s): dfa8f7d

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +113 -3
app.py CHANGED
@@ -18,6 +18,7 @@ import aiosqlite
18
  import trafilatura
19
  from bs4 import BeautifulSoup
20
  from duckduckgo_search import DDGS
 
21
  from fastapi import FastAPI, Request
22
  from fastapi.responses import JSONResponse, HTMLResponse
23
  from huggingface_hub import AsyncInferenceClient
@@ -61,6 +62,9 @@ def get_system_prompt() -> str:
61
  )
62
 
63
  URL_PATTERN = re.compile(r'https?://[^\s<>"\']+')
 
 
 
64
  SEARCH_TRIGGERS = ["검색", "μ°Ύμ•„", "μ•Œμ•„λ΄", "μ•Œλ €μ€˜", "쑰사", "search", "look up", "find"]
65
 
66
 
@@ -269,12 +273,70 @@ async def fetch_url_content(url: str, max_chars: int = 3000) -> str:
269
  return f"URL 읽기 μ‹€νŒ¨: {str(e)[:100]}"
270
 
271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  # ===== Message Routing & LLM =====
273
 
274
  def detect_message_type(text: str) -> str:
275
  """Determine how to handle the message."""
276
  if text.startswith("/"):
277
  return "command"
 
 
278
  if URL_PATTERN.search(text):
279
  return "url"
280
  if text.startswith("/search") or extract_search_query(text):
@@ -337,6 +399,7 @@ async def handle_command(chat_id: int, text: str) -> JSONResponse:
337
  "κΈ°λŠ₯:\n"
338
  "- λ©”μ‹œμ§€λ₯Ό 보내면 AIκ°€ λ‹΅λ³€\n"
339
  "- URL을 보내면 μžλ™ μš”μ•½\n"
 
340
  "- '검색' ν‚€μ›Œλ“œ 포함 μ‹œ μ›Ή 검색\n"
341
  "- /search <검색어> - μ›Ή 검색\n\n"
342
  "μ„€μ •:\n"
@@ -367,7 +430,7 @@ async def handle_command(chat_id: int, text: str) -> JSONResponse:
367
  f"총 λ©”μ‹œμ§€: {stats['total_messages']}개\n"
368
  f"첫 λŒ€ν™”: {stats['first'] or 'μ—†μŒ'}\n"
369
  f"졜근 λŒ€ν™”: {stats['last'] or 'μ—†μŒ'}\n\n"
370
- f"κΈ°λŠ₯: LLM λŒ€ν™”, μ›Ή 검색, URL μš”μ•½, 영ꡬ κΈ°μ–΅")
371
 
372
  if cmd == "/help":
373
  return reply_msg(chat_id,
@@ -375,6 +438,7 @@ async def handle_command(chat_id: int, text: str) -> JSONResponse:
375
  "λŒ€ν™”: λ©”μ‹œμ§€λ₯Ό 보내면 AIκ°€ λ‹΅λ³€ν•©λ‹ˆλ‹€.\n"
376
  "검색: 'λΉ„νŠΈμ½”μΈ μ‹œμ„Έ κ²€μƒ‰ν•΄μ€˜' λ˜λŠ” /search λΉ„νŠΈμ½”μΈ μ‹œμ„Έ\n"
377
  "URL μš”μ•½: URL을 보내면 μžλ™μœΌλ‘œ λ‚΄μš© μš”μ•½\n"
 
378
  "κΈ°μ–΅: λŒ€ν™” λ‚΄μš©μ„ κΈ°μ–΅ν•©λ‹ˆλ‹€ (μ„œλ²„ μž¬μ‹œμž‘ 후에도 μœ μ§€)\n\n"
379
  "λͺ…λ Ήμ–΄:\n"
380
  "/search <검색어> - μ›Ή 검색\n"
@@ -432,6 +496,49 @@ async def handle_url(chat_id: int, text: str) -> JSONResponse:
432
  return reply_msg(chat_id, llm_response)
433
 
434
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
435
  async def handle_chat(chat_id: int, text: str) -> JSONResponse:
436
  """Normal LLM chat with history."""
437
  # Check for search triggers
@@ -492,19 +599,20 @@ async def root():
492
  return HTMLResponse(
493
  "<html><body style='font-family:system-ui;max-width:600px;margin:60px auto;padding:0 20px'>"
494
  "<h1>Clawdbot AI Secretary</h1>"
495
- "<p>Telegram bot + Web Search + URL Reader + Persistent Memory</p>"
496
  "<p style='color:green;font-weight:bold'>ONLINE</p>"
497
  "<h3>Features</h3><ul>"
498
  "<li>LLM Chat (5 models)</li>"
499
  "<li>DuckDuckGo Web Search</li>"
500
  "<li>URL Content Summarizer</li>"
 
501
  "<li>SQLite Persistent Memory</li>"
502
  "</ul></body></html>")
503
 
504
 
505
  @app.get("/health")
506
  async def health():
507
- return {"status": "ok", "features": ["chat", "search", "url", "memory"]}
508
 
509
 
510
  @app.post("/webhook")
@@ -528,6 +636,8 @@ async def webhook(request: Request):
528
  msg_type = detect_message_type(text)
529
  if msg_type == "command":
530
  return await handle_command(chat_id, text)
 
 
531
  if msg_type == "url":
532
  return await handle_url(chat_id, text)
533
  return await handle_chat(chat_id, text)
 
18
  import trafilatura
19
  from bs4 import BeautifulSoup
20
  from duckduckgo_search import DDGS
21
+ from youtube_transcript_api import YouTubeTranscriptApi
22
  from fastapi import FastAPI, Request
23
  from fastapi.responses import JSONResponse, HTMLResponse
24
  from huggingface_hub import AsyncInferenceClient
 
62
  )
63
 
64
  URL_PATTERN = re.compile(r'https?://[^\s<>"\']+')
65
# Recognises the common YouTube link shapes and captures the 11-character
# video id as group 1:
#   youtube.com/watch?v=ID      (``v`` may follow other query parameters)
#   youtube.com/shorts/ID, youtube.com/embed/ID, youtube.com/live/ID
#   youtu.be/ID
# The scheme and a www./m./music. host prefix are optional. The trailing
# lookahead rejects tokens longer than 11 id characters, so an over-long
# token is not silently truncated into a bogus id.
YOUTUBE_PATTERN = re.compile(
    r'(?:https?://)?(?:www\.|m\.|music\.)?'
    r'(?:youtube\.com/(?:watch\?(?:[^\s#]*?&)?v=|shorts/|embed/|live/)|youtu\.be/)'
    r'([a-zA-Z0-9_-]{11})(?![a-zA-Z0-9_-])'
)
68
  SEARCH_TRIGGERS = ["검색", "μ°Ύμ•„", "μ•Œμ•„λ΄", "μ•Œλ €μ€˜", "쑰사", "search", "look up", "find"]
69
 
70
 
 
273
  return f"URL 읽기 μ‹€νŒ¨: {str(e)[:100]}"
274
 
275
 
276
+ # ===== YouTube Transcript =====
277
+
278
def _fetch_transcript(video_id: str) -> str:
    """Fetch a YouTube transcript as newline-joined text (blocking call).

    Language preference is Korean, then English; if neither exists, the
    first transcript the video lists is used as-is (no translation is
    attempted).

    Args:
        video_id: The YouTube video id.

    Returns:
        The transcript text, one caption snippet per line, or "" when no
        transcript is available or any error occurs (the error is logged).
    """
    try:
        # Newer youtube-transcript-api releases expose an instance-level
        # ``list``; older ones only provide the static ``list_transcripts``.
        # Supporting both keeps the feature from silently returning "".
        if hasattr(YouTubeTranscriptApi, "list"):
            transcript_list = YouTubeTranscriptApi().list(video_id)
        else:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        try:
            # Language codes are tried in the order given.
            transcript = transcript_list.find_transcript(["ko", "en"])
        except Exception:
            # Neither preferred language exists: take the first listed track.
            transcript = next(iter(transcript_list), None)

        if transcript is None:
            return ""

        # Snippets are objects with a ``text`` attribute in newer library
        # versions and plain dicts in older ones; accept either shape.
        lines = []
        for snippet in transcript.fetch():
            if isinstance(snippet, dict):
                piece = snippet.get("text")
            else:
                piece = getattr(snippet, "text", None)
            if piece is not None:
                lines.append(piece)
        return "\n".join(lines)
    except Exception as e:
        # Best-effort by design: callers treat "" as "no transcript".
        logger.error("YouTube transcript error: %s", e)
        return ""
310
+
311
+
312
async def fetch_youtube_transcript(video_id: str, max_chars: int = 4000) -> str:
    """Fetch a transcript without blocking the event loop.

    Runs the blocking ``_fetch_transcript`` in the default executor and
    truncates the result for use as LLM context.

    Args:
        video_id: The YouTube video id.
        max_chars: Maximum number of transcript characters to keep; longer
            text is cut and a truncation marker is appended.

    Returns:
        The (possibly truncated) transcript text, or "" when none was found.
    """
    # Inside a coroutine the running loop is the one to use.
    loop = asyncio.get_running_loop()
    text = await loop.run_in_executor(None, _fetch_transcript, video_id)

    if not text:
        return ""

    if len(text) > max_chars:
        text = text[:max_chars] + "\n...(μ΄ν•˜ μƒλž΅)"

    return text
324
+
325
+
326
def extract_youtube_id(text: str) -> str | None:
    """Return the first YouTube video id found in *text*, or None if absent."""
    found = YOUTUBE_PATTERN.search(text)
    if not found:
        return None
    # Group 1 of YOUTUBE_PATTERN is the captured video id.
    return found.group(1)
330
+
331
+
332
  # ===== Message Routing & LLM =====
333
 
334
  def detect_message_type(text: str) -> str:
335
  """Determine how to handle the message."""
336
  if text.startswith("/"):
337
  return "command"
338
+ if extract_youtube_id(text):
339
+ return "youtube"
340
  if URL_PATTERN.search(text):
341
  return "url"
342
  if text.startswith("/search") or extract_search_query(text):
 
399
  "κΈ°λŠ₯:\n"
400
  "- λ©”μ‹œμ§€λ₯Ό 보내면 AIκ°€ λ‹΅λ³€\n"
401
  "- URL을 보내면 μžλ™ μš”μ•½\n"
402
+ "- YouTube URL을 보내면 μ˜μƒ μš”μ•½\n"
403
  "- '검색' ν‚€μ›Œλ“œ 포함 μ‹œ μ›Ή 검색\n"
404
  "- /search <검색어> - μ›Ή 검색\n\n"
405
  "μ„€μ •:\n"
 
430
  f"총 λ©”μ‹œμ§€: {stats['total_messages']}개\n"
431
  f"첫 λŒ€ν™”: {stats['first'] or 'μ—†μŒ'}\n"
432
  f"졜근 λŒ€ν™”: {stats['last'] or 'μ—†μŒ'}\n\n"
433
+ f"κΈ°λŠ₯: LLM λŒ€ν™”, μ›Ή 검색, URL μš”μ•½, YouTube μš”μ•½, 영ꡬ κΈ°μ–΅")
434
 
435
  if cmd == "/help":
436
  return reply_msg(chat_id,
 
438
  "λŒ€ν™”: λ©”μ‹œμ§€λ₯Ό 보내면 AIκ°€ λ‹΅λ³€ν•©λ‹ˆλ‹€.\n"
439
  "검색: 'λΉ„νŠΈμ½”μΈ μ‹œμ„Έ κ²€μƒ‰ν•΄μ€˜' λ˜λŠ” /search λΉ„νŠΈμ½”μΈ μ‹œμ„Έ\n"
440
  "URL μš”μ•½: URL을 보내면 μžλ™μœΌλ‘œ λ‚΄μš© μš”μ•½\n"
441
+ "YouTube μš”μ•½: YouTube URL을 보내면 μžλ§‰ 기반 μ˜μƒ μš”μ•½\n"
442
  "κΈ°μ–΅: λŒ€ν™” λ‚΄μš©μ„ κΈ°μ–΅ν•©λ‹ˆλ‹€ (μ„œλ²„ μž¬μ‹œμž‘ 후에도 μœ μ§€)\n\n"
443
  "λͺ…λ Ήμ–΄:\n"
444
  "/search <검색어> - μ›Ή 검색\n"
 
496
  return reply_msg(chat_id, llm_response)
497
 
498
 
499
async def handle_youtube(chat_id: int, text: str) -> JSONResponse:
    """Answer a message containing a YouTube link using the video transcript.

    Flow:
      1. Extract the video id; reply with an error message if none is found.
      2. Save the user message and fetch the transcript.
      3. If there is no transcript, fall back to fetching the video's watch
         page and summarising that; if that also yields nothing usable,
         reply that the video cannot be summarised.
      4. Otherwise send the transcript to the LLM. Any text left in the
         message after removing links is treated as the user's request;
         with no remaining text a default summary prompt is used.

    Args:
        chat_id: Chat the reply is addressed to.
        text: The raw incoming message text.

    Returns:
        The response produced by ``reply_msg``.
    """
    video_id = extract_youtube_id(text)
    if not video_id:
        return reply_msg(chat_id, "YouTube URL을 인식할 수 μ—†μŠ΅λ‹ˆλ‹€.")

    await save_message(chat_id, "user", text)
    transcript = await fetch_youtube_transcript(video_id)

    if not transcript:
        # Build the watch URL from the id rather than taking the first URL in
        # the message: the message may contain other links, or the YouTube
        # link may have been written without a scheme.
        watch_url = f"https://www.youtube.com/watch?v={video_id}"
        content = await fetch_url_content(watch_url)
        # The substrings below are matched against the fetched text to detect
        # a failed fetch (presumably fetch_url_content's error messages).
        if content and "μ‹€νŒ¨" not in content and "μΆ”μΆœν•  수 μ—†" not in content:
            prompt = "μ•„λž˜ YouTube νŽ˜μ΄μ§€ λ‚΄μš©μ„ λ°”νƒ•μœΌλ‘œ μ˜μƒμ˜ 핡심 λ‚΄μš©μ„ μš”μ•½ν•΄μ€˜."
            llm_response = await generate_llm_response(chat_id, prompt, extra_context=content)
            await save_message(chat_id, "assistant", llm_response)
            return reply_msg(chat_id, llm_response)
        return reply_msg(chat_id,
            "이 μ˜μƒμ€ μžλ§‰(CC)이 μ—†μ–΄ μš”μ•½ν•  수 μ—†μŠ΅λ‹ˆλ‹€.\n"
            "μžλ§‰μ΄ μžˆλŠ” μ˜μƒλ§Œ μš”μ•½ κ°€λŠ₯ν•©λ‹ˆλ‹€.")

    # Determine user intent: whatever remains once links are removed.
    # URL_PATTERN strips every http(s) URL; the second pass strips YouTube
    # links written without a scheme (plus any trailing query string), which
    # YOUTUBE_PATTERN accepts but URL_PATTERN does not.
    clean_text = URL_PATTERN.sub("", text)
    clean_text = re.sub(YOUTUBE_PATTERN.pattern + r"\S*", "", clean_text).strip()
    if clean_text:
        prompt = f"{clean_text}\n\nμœ„ μš”μ²­μ— λŒ€ν•΄ μ•„λž˜ YouTube μ˜μƒ μžλ§‰μ„ μ°Έκ³ ν•΄μ„œ λ‹΅λ³€ν•΄μ€˜."
    else:
        prompt = (
            "μ•„λž˜ YouTube μ˜μƒ μžλ§‰μ„ λΆ„μ„ν•˜μ—¬ 핡심 μš”μ•½ν•΄μ€˜.\n"
            "κ·œμΉ™:\n"
            "1. μ˜μƒμ˜ μ£Όμ œμ™€ 핡심 λ©”μ‹œμ§€λ₯Ό λ¨Όμ € ν•œ μ€„λ‘œ 정리\n"
            "2. μ£Όμš” λ‚΄μš©μ„ 3~5개 포인트둜 μš”μ•½\n"
            "3. 핡심 μΈμ‚¬μ΄νŠΈλ‚˜ 결둠이 있으면 λ§ˆμ§€λ§‰μ— 정리\n"
            "4. μžλ§‰ 원문을 κ·ΈλŒ€λ‘œ μΈμš©ν•˜μ§€ 말고 ν•΅μ‹¬λ§Œ 정리"
        )

    llm_response = await generate_llm_response(chat_id, prompt, extra_context=transcript)
    await save_message(chat_id, "assistant", llm_response)
    return reply_msg(chat_id, f"🎬 YouTube μš”μ•½\n\n{llm_response}")
540
+
541
+
542
  async def handle_chat(chat_id: int, text: str) -> JSONResponse:
543
  """Normal LLM chat with history."""
544
  # Check for search triggers
 
599
  return HTMLResponse(
600
  "<html><body style='font-family:system-ui;max-width:600px;margin:60px auto;padding:0 20px'>"
601
  "<h1>Clawdbot AI Secretary</h1>"
602
+ "<p>Telegram bot + Web Search + URL Reader + YouTube Summary + Persistent Memory</p>"
603
  "<p style='color:green;font-weight:bold'>ONLINE</p>"
604
  "<h3>Features</h3><ul>"
605
  "<li>LLM Chat (5 models)</li>"
606
  "<li>DuckDuckGo Web Search</li>"
607
  "<li>URL Content Summarizer</li>"
608
+ "<li>YouTube Video Summarizer</li>"
609
  "<li>SQLite Persistent Memory</li>"
610
  "</ul></body></html>")
611
 
612
 
613
  @app.get("/health")
614
  async def health():
615
+ return {"status": "ok", "features": ["chat", "search", "url", "youtube", "memory"]}
616
 
617
 
618
  @app.post("/webhook")
 
636
  msg_type = detect_message_type(text)
637
  if msg_type == "command":
638
  return await handle_command(chat_id, text)
639
+ if msg_type == "youtube":
640
+ return await handle_youtube(chat_id, text)
641
  if msg_type == "url":
642
  return await handle_url(chat_id, text)
643
  return await handle_chat(chat_id, text)