Spaces:

Daviidkang
/

telegram-clawdbot

Running

App Files Files Community

Daviidkang committed on Feb 14

Commit

958264d

verified ·

1 Parent(s): dfa8f7d

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +113 -3

app.py CHANGED Viewed

@@ -18,6 +18,7 @@ import aiosqlite
 import trafilatura
 from bs4 import BeautifulSoup
 from duckduckgo_search import DDGS
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, HTMLResponse
 from huggingface_hub import AsyncInferenceClient
@@ -61,6 +62,9 @@ def get_system_prompt() -> str:
     )
 URL_PATTERN = re.compile(r'https?://[^\s<>"\']+')
 SEARCH_TRIGGERS = ["검색", "찾아", "알아봐", "알려줘", "조사", "search", "look up", "find"]
@@ -269,12 +273,70 @@ async def fetch_url_content(url: str, max_chars: int = 3000) -> str:
         return f"URL 읽기 실패: {str(e)[:100]}"
 # ===== Message Routing & LLM =====
 def detect_message_type(text: str) -> str:
     """Determine how to handle the message."""
     if text.startswith("/"):
         return "command"
     if URL_PATTERN.search(text):
         return "url"
     if text.startswith("/search") or extract_search_query(text):
@@ -337,6 +399,7 @@ async def handle_command(chat_id: int, text: str) -> JSONResponse:
             "기능:\n"
             "- 메시지를 보내면 AI가 답변\n"
             "- URL을 보내면 자동 요약\n"
             "- '검색' 키워드 포함 시 웹 검색\n"
             "- /search <검색어> - 웹 검색\n\n"
             "설정:\n"
@@ -367,7 +430,7 @@ async def handle_command(chat_id: int, text: str) -> JSONResponse:
             f"총 메시지: {stats['total_messages']}개\n"
             f"첫 대화: {stats['first'] or '없음'}\n"
             f"최근 대화: {stats['last'] or '없음'}\n\n"
-            f"기능: LLM 대화, 웹 검색, URL 요약, 영구 기억")
     if cmd == "/help":
         return reply_msg(chat_id,
@@ -375,6 +438,7 @@ async def handle_command(chat_id: int, text: str) -> JSONResponse:
             "대화: 메시지를 보내면 AI가 답변합니다.\n"
             "검색: '비트코인 시세 검색해줘' 또는 /search 비트코인 시세\n"
             "URL 요약: URL을 보내면 자동으로 내용 요약\n"
             "기억: 대화 내용을 기억합니다 (서버 재시작 후에도 유지)\n\n"
             "명령어:\n"
             "/search <검색어> - 웹 검색\n"
@@ -432,6 +496,49 @@ async def handle_url(chat_id: int, text: str) -> JSONResponse:
     return reply_msg(chat_id, llm_response)
 async def handle_chat(chat_id: int, text: str) -> JSONResponse:
     """Normal LLM chat with history."""
     # Check for search triggers
@@ -492,19 +599,20 @@ async def root():
     return HTMLResponse(
         "<html><body style='font-family:system-ui;max-width:600px;margin:60px auto;padding:0 20px'>"
         "<h1>Clawdbot AI Secretary</h1>"
-        "<p>Telegram bot + Web Search + URL Reader + Persistent Memory</p>"
         "<p style='color:green;font-weight:bold'>ONLINE</p>"
         "<h3>Features</h3><ul>"
         "<li>LLM Chat (5 models)</li>"
         "<li>DuckDuckGo Web Search</li>"
         "<li>URL Content Summarizer</li>"
         "<li>SQLite Persistent Memory</li>"
         "</ul></body></html>")
 @app.get("/health")
 async def health():
-    return {"status": "ok", "features": ["chat", "search", "url", "memory"]}
 @app.post("/webhook")
@@ -528,6 +636,8 @@ async def webhook(request: Request):
         msg_type = detect_message_type(text)
         if msg_type == "command":
             return await handle_command(chat_id, text)
         if msg_type == "url":
             return await handle_url(chat_id, text)
         return await handle_chat(chat_id, text)

 import trafilatura
 from bs4 import BeautifulSoup
 from duckduckgo_search import DDGS
+from youtube_transcript_api import YouTubeTranscriptApi
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, HTMLResponse
 from huggingface_hub import AsyncInferenceClient
     )
 URL_PATTERN = re.compile(r'https?://[^\s<>"\']+')
+YOUTUBE_PATTERN = re.compile(
+    r'(?:https?://)?(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/shorts/)([a-zA-Z0-9_-]{11})'
+)
 SEARCH_TRIGGERS = ["검색", "찾아", "알아봐", "알려줘", "조사", "search", "look up", "find"]
         return f"URL 읽기 실패: {str(e)[:100]}"
+# ===== YouTube Transcript =====
+def _fetch_transcript(video_id: str) -> str:
+    """Fetch YouTube transcript (Korean > English > any)."""
+    try:
+        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
+        # Priority: Korean manual > Korean auto > English > any
+        transcript = None
+        for lang_codes in [["ko"], ["en"]]:
+            try:
+                transcript = transcript_list.find_transcript(lang_codes)
+                break
+            except Exception:
+                continue
+        if transcript is None:
+            # Fallback: first available, translated to Korean if possible
+            for t in transcript_list:
+                transcript = t
+                break
+        if transcript is None:
+            return ""
+        snippets = transcript.fetch()
+        lines = [s.text for s in snippets if hasattr(s, 'text')]
+        if not lines:
+            # Fallback for dict-style response
+            lines = [s["text"] for s in snippets if isinstance(s, dict) and "text" in s]
+        return "\n".join(lines)
+    except Exception as e:
+        logger.error(f"YouTube transcript error: {e}")
+        return ""
+async def fetch_youtube_transcript(video_id: str, max_chars: int = 4000) -> str:
+    """Async wrapper for YouTube transcript fetching."""
+    loop = asyncio.get_event_loop()
+    text = await loop.run_in_executor(None, partial(_fetch_transcript, video_id))
+    if not text:
+        return ""
+    if len(text) > max_chars:
+        text = text[:max_chars] + "\n...(이하 생략)"
+    return text
+def extract_youtube_id(text: str) -> str | None:
+    """Extract YouTube video ID from text."""
+    m = YOUTUBE_PATTERN.search(text)
+    return m.group(1) if m else None
 # ===== Message Routing & LLM =====
 def detect_message_type(text: str) -> str:
     """Determine how to handle the message."""
     if text.startswith("/"):
         return "command"
+    if extract_youtube_id(text):
+        return "youtube"
     if URL_PATTERN.search(text):
         return "url"
     if text.startswith("/search") or extract_search_query(text):
             "기능:\n"
             "- 메시지를 보내면 AI가 답변\n"
             "- URL을 보내면 자동 요약\n"
+            "- YouTube URL을 보내면 영상 요약\n"
             "- '검색' 키워드 포함 시 웹 검색\n"
             "- /search <검색어> - 웹 검색\n\n"
             "설정:\n"
             f"총 메시지: {stats['total_messages']}개\n"
             f"첫 대화: {stats['first'] or '없음'}\n"
             f"최근 대화: {stats['last'] or '없음'}\n\n"
+            f"기능: LLM 대화, 웹 검색, URL 요약, YouTube 요약, 영구 기억")
     if cmd == "/help":
         return reply_msg(chat_id,
             "대화: 메시지를 보내면 AI가 답변합니다.\n"
             "검색: '비트코인 시세 검색해줘' 또는 /search 비트코인 시세\n"
             "URL 요약: URL을 보내면 자동으로 내용 요약\n"
+            "YouTube 요약: YouTube URL을 보내면 자막 기반 영상 요약\n"
             "기억: 대화 내용을 기억합니다 (서버 재시작 후에도 유지)\n\n"
             "명령어:\n"
             "/search <검색어> - 웹 검색\n"
     return reply_msg(chat_id, llm_response)
+async def handle_youtube(chat_id: int, text: str) -> JSONResponse:
+    """YouTube video transcript + LLM summary."""
+    video_id = extract_youtube_id(text)
+    if not video_id:
+        return reply_msg(chat_id, "YouTube URL을 인식할 수 없습니다.")
+    await save_message(chat_id, "user", text)
+    transcript = await fetch_youtube_transcript(video_id)
+    if not transcript:
+        # Fallback: treat as regular URL
+        url_match = URL_PATTERN.search(text)
+        if url_match:
+            content = await fetch_url_content(url_match.group())
+            if content and "실패" not in content and "추출할 수 없" not in content:
+                prompt = "아래 YouTube 페이지 내용을 바탕으로 영상의 핵심 내용을 요약해줘."
+                llm_response = await generate_llm_response(chat_id, prompt, extra_context=content)
+                await save_message(chat_id, "assistant", llm_response)
+                return reply_msg(chat_id, llm_response)
+        return reply_msg(chat_id,
+            "이 영상은 자막(CC)이 없어 요약할 수 없습니다.\n"
+            "자막이 있는 영상만 요약 가능합니다.")
+    # Determine user intent
+    clean_text = URL_PATTERN.sub("", text).strip()
+    clean_text = re.sub(r'https?://youtu\.be/[a-zA-Z0-9_-]+', '', clean_text).strip()
+    if clean_text:
+        prompt = f"{clean_text}\n\n위 요청에 대해 아래 YouTube 영상 자막을 참고해서 답변해줘."
+    else:
+        prompt = (
+            "아래 YouTube 영상 자막을 분석하여 핵심 요약해줘.\n"
+            "규칙:\n"
+            "1. 영상의 주제와 핵심 메시지를 먼저 한 줄로 정리\n"
+            "2. 주요 내용을 3~5개 포인트로 요약\n"
+            "3. 핵심 인사이트나 결론이 있으면 마지막에 정리\n"
+            "4. 자막 원문을 그대로 인용하지 말고 핵심만 정리"
+        )
+    llm_response = await generate_llm_response(chat_id, prompt, extra_context=transcript)
+    await save_message(chat_id, "assistant", llm_response)
+    return reply_msg(chat_id, f"🎬 YouTube 요약\n\n{llm_response}")
 async def handle_chat(chat_id: int, text: str) -> JSONResponse:
     """Normal LLM chat with history."""
     # Check for search triggers
     return HTMLResponse(
         "<html><body style='font-family:system-ui;max-width:600px;margin:60px auto;padding:0 20px'>"
         "<h1>Clawdbot AI Secretary</h1>"
+        "<p>Telegram bot + Web Search + URL Reader + YouTube Summary + Persistent Memory</p>"
         "<p style='color:green;font-weight:bold'>ONLINE</p>"
         "<h3>Features</h3><ul>"
         "<li>LLM Chat (5 models)</li>"
         "<li>DuckDuckGo Web Search</li>"
         "<li>URL Content Summarizer</li>"
+        "<li>YouTube Video Summarizer</li>"
         "<li>SQLite Persistent Memory</li>"
         "</ul></body></html>")
 @app.get("/health")
 async def health():
+    return {"status": "ok", "features": ["chat", "search", "url", "youtube", "memory"]}
 @app.post("/webhook")
         msg_type = detect_message_type(text)
         if msg_type == "command":
             return await handle_command(chat_id, text)
+        if msg_type == "youtube":
+            return await handle_youtube(chat_id, text)
         if msg_type == "url":
             return await handle_url(chat_id, text)
         return await handle_chat(chat_id, text)