Junhoee commited on
Commit
7245599
·
verified ·
1 Parent(s): b4dd04f

Upload 18 files

Browse files
README.md CHANGED
@@ -1,16 +1,73 @@
1
  ---
2
- title: Megumin Chat
3
- emoji: 💬
4
- colorFrom: yellow
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.5.1
8
  app_file: app.py
9
  pinned: false
10
- hf_oauth: true
11
- hf_oauth_scopes:
12
- - inference-api
13
- short_description: You can chat with Megumin in KONOSUBA
14
  ---
15
 
16
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Megumin RAG Chat
3
+ emoji: "💥"
4
+ colorFrom: red
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: false
 
 
 
 
10
  ---
11
 
12
+ # Megumin ADK Agent
13
+
14
+ 이 프로젝트는 `data/processed/*.json`의 Q/A 데이터를 로컬 RAG 방식으로 조회하고, 메구밍 페르소나로 답변하는 Gradio 앱입니다.
15
+
16
+ ## Hugging Face Spaces 배포 기준
17
+
18
+ 이 저장소는 Hugging Face Spaces의 Gradio Space 형태로 배포할 수 있도록 정리되어 있습니다.
19
+
20
+ 필요한 것은 아래와 같습니다.
21
+
22
+ - 루트 `app.py`
23
+ - 루트 `requirements.txt`
24
+ - Space Secret에 Gemini API 키 등록
25
+
26
+ ## Spaces에서 필요한 Secret
27
+
28
+ Hugging Face Spaces 설정 화면에서 아래 환경변수 중 하나를 Secret으로 등록하세요.
29
+
30
+ - `GOOGLE_API_KEY`
31
+ - 또는 `GEMINI_API_KEY`
32
+
33
+ 권장:
34
+
35
+ ```text
36
+ GOOGLE_API_KEY=발급받은_실제_Gemini_API_키
37
+ ```
38
+
39
+ ## 로컬 실행
40
+
41
+ ```bash
42
+ python app_gradio.py
43
+ ```
44
+
45
+ 또는 Spaces와 동일한 진입점 기준으로:
46
+
47
+ ```bash
48
+ python app.py
49
+ ```
50
+
51
+ ## 모델 변경
52
+
53
+ 기본 모델은 `gemini-3.1-flash-lite-preview` 입니다.
54
+
55
+ 필요하면 환경변수로 바꿀 수 있습니다.
56
+
57
+ ```bash
58
+ set MEGUMIN_AGENT_MODEL=gemini-2.5-flash-lite
59
+ ```
60
+
61
+ ## 데이터셋 변환
62
+
63
+ 원본 raw txt를 processed JSON으로 변환하려면:
64
+
65
+ ```bash
66
+ python scripts/convert_raw_to_processed.py
67
+ ```
68
+
69
+ 생성 파일:
70
+
71
+ ```text
72
+ data/processed/megumin_qa_dataset.json
73
+ ```
app.py CHANGED
@@ -1,68 +1,4 @@
1
- import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
-
5
- def respond(
6
- message,
7
- history: list[dict[str, str]],
8
- system_message,
9
- max_tokens,
10
- temperature,
11
- top_p,
12
- hf_token: gr.OAuthToken,
13
- ):
14
- """
15
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
16
- """
17
- client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
18
-
19
- messages = [{"role": "system", "content": system_message}]
20
-
21
- messages.extend(history)
22
-
23
- messages.append({"role": "user", "content": message})
24
-
25
- response = ""
26
-
27
- for message in client.chat_completion(
28
- messages,
29
- max_tokens=max_tokens,
30
- stream=True,
31
- temperature=temperature,
32
- top_p=top_p,
33
- ):
34
- choices = message.choices
35
- token = ""
36
- if len(choices) and choices[0].delta.content:
37
- token = choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
-
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- chatbot = gr.ChatInterface(
47
- respond,
48
- additional_inputs=[
49
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
- gr.Slider(
53
- minimum=0.1,
54
- maximum=1.0,
55
- value=0.95,
56
- step=0.05,
57
- label="Top-p (nucleus sampling)",
58
- ),
59
- ],
60
- )
61
-
62
- with gr.Blocks() as demo:
63
- with gr.Sidebar():
64
- gr.LoginButton()
65
- chatbot.render()
66
 
67
 
68
  if __name__ == "__main__":
 
1
+ from app_gradio import demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
 
4
  if __name__ == "__main__":
app_gradio.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+
5
+ import gradio as gr
6
+
7
+ from megumin_agent.chat import chat_once
8
+ from megumin_agent.chat import create_chat_services
9
+
10
+
11
+ SERVICES = create_chat_services()
12
+
13
+
14
+ async def respond(
15
+ message: str,
16
+ history: list[dict[str, str]],
17
+ session_id: str | None,
18
+ ):
19
+ if not message.strip():
20
+ return history, session_id, ""
21
+
22
+ reply, session_id = await chat_once(
23
+ user_message=message,
24
+ services=SERVICES,
25
+ session_id=session_id,
26
+ )
27
+ updated_history = list(history)
28
+ updated_history.append({"role": "user", "content": message})
29
+ updated_history.append({"role": "assistant", "content": reply})
30
+ return updated_history, session_id, ""
31
+
32
+
33
+ with gr.Blocks(title="Megumin RAG Chat") as demo:
34
+ gr.Markdown(
35
+ """
36
+ # Megumin RAG Chat
37
+ `gemini-2.5-flash-lite` + Google ADK + local JSON RAG
38
+ """
39
+ )
40
+ chatbot = gr.Chatbot(height=520)
41
+ session_state = gr.State(value=None)
42
+ user_input = gr.Textbox(
43
+ label="Message",
44
+ placeholder="메구밍에게 말을 걸어 보세요.",
45
+ )
46
+ clear_button = gr.Button("Clear")
47
+
48
+ user_input.submit(
49
+ fn=respond,
50
+ inputs=[user_input, chatbot, session_state],
51
+ outputs=[chatbot, session_state, user_input],
52
+ )
53
+ clear_button.click(
54
+ fn=lambda: ([], None, ""),
55
+ inputs=None,
56
+ outputs=[chatbot, session_state, user_input],
57
+ )
58
+
59
+
60
+ if __name__ == "__main__":
61
+ demo.launch(server_name="0.0.0.0")
data/processed/README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Processed Dataset Schema
2
+
3
+ `megumin_agent` reads every `*.json` file under this folder and treats them as retrieval sources.
4
+
5
+ Supported formats:
6
+
7
+ ```json
8
+ [
9
+ {
10
+ "question": "카즈마를 어떻게 생각해?",
11
+ "answer": "..."
12
+ }
13
+ ]
14
+ ```
15
+
16
+ ```json
17
+ {
18
+ "items": [
19
+ {
20
+ "q": "메구밍 자기소개해줘.",
21
+ "a": "..."
22
+ }
23
+ ]
24
+ }
25
+ ```
26
+
27
+ JSONL is also supported as long as each line is a single JSON object containing a question field and an answer field.
28
+
29
+ Accepted question keys:
30
+ - `question`
31
+ - `query`
32
+ - `q`
33
+ - `prompt`
34
+ - `user`
35
+ - `instruction`
36
+ - `input`
37
+
38
+ Accepted answer keys:
39
+ - `answer`
40
+ - `response`
41
+ - `a`
42
+ - `output`
43
+ - `assistant`
44
+ - `completion`
data/processed/megumin_qa_dataset.json ADDED
The diff for this file is too large to render. See raw diff
 
megumin_agent/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from . import agent
megumin_agent/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (186 Bytes). View file
 
megumin_agent/__pycache__/agent.cpython-312.pyc ADDED
Binary file (5.05 kB). View file
 
megumin_agent/__pycache__/bootstrap.cpython-312.pyc ADDED
Binary file (948 Bytes). View file
 
megumin_agent/__pycache__/chat.cpython-312.pyc ADDED
Binary file (2.77 kB). View file
 
megumin_agent/__pycache__/retrieval.cpython-312.pyc ADDED
Binary file (11.7 kB). View file
 
megumin_agent/__pycache__/runner.cpython-312.pyc ADDED
Binary file (1.07 kB). View file
 
megumin_agent/agent.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from typing import Any
5
+
6
+ from .bootstrap import PROJECT_ROOT
7
+ from .bootstrap import bootstrap_environment
8
+
9
+ bootstrap_environment()
10
+
11
+ from google.adk.agents import LlmAgent
12
+ from google.adk.agents.callback_context import CallbackContext
13
+ from google.adk.tools.tool_context import ToolContext
14
+
15
+ from .retrieval import JsonQaRetriever
16
+
17
# Directory of processed Q/A JSON files used as the local RAG corpus.
DATASET_DIR = PROJECT_ROOT / "data" / "processed"
# NOTE(review): this default is `gemini-3.1-flash-lite-preview`, but the README
# documents `gemini-2.5-flash-lite` — confirm which model is intended.
MODEL_NAME = os.getenv("MEGUMIN_AGENT_MODEL", "gemini-3.1-flash-lite-preview")
19
+
20
+
21
def retrieve_megumin_examples(
    user_query: str,
    top_k: int = 3,
    tool_context: ToolContext | None = None,
) -> dict[str, Any]:
    """Retrieve similar Q/A cases from processed Megumin JSON datasets."""

    result = JsonQaRetriever(DATASET_DIR).retrieve(user_query, top_k=top_k)

    if tool_context is not None:
        # Mirror the retrieval outcome into session state so later callbacks
        # and turns can inspect what the tool returned.
        state_snapshot = {
            "last_rag_query": user_query,
            "last_rag_match_count": result["match_count"],
            "last_rag_matches": result["matches"],
            "last_rag_style_notes": result["style_notes"],
        }
        for state_key, state_value in state_snapshot.items():
            tool_context.state[state_key] = state_value

    return result
38
+
39
+
40
async def before_agent_callback(callback_context: CallbackContext):
    """Seed per-turn state (persona, dataset dir, last user query) before the agent runs."""
    callback_context.state["app:persona_name"] = "Megumin"
    callback_context.state["app:dataset_dir"] = str(DATASET_DIR)

    # Record the first text part of the incoming user content, if any.
    latest_query = ""
    content = callback_context.user_content
    if content and content.parts:
        latest_query = content.parts[0].text
    callback_context.state["user:last_user_query"] = latest_query
48
+
49
+
50
async def after_tool_callback(tool, args, tool_context: ToolContext, tool_response):
    """Record bookkeeping (call count, last tool name/args) after each RAG tool call."""
    if tool.name != "retrieve_megumin_examples":
        # Other tools are ignored; returning None keeps the original response.
        return None

    calls_so_far = int(tool_context.state.get("rag_tool_calls", 0))
    tool_context.state["rag_tool_calls"] = calls_so_far + 1
    tool_context.state["last_tool_name"] = tool.name
    tool_context.state["last_tool_args"] = args
    return None
59
+
60
+
61
async def after_agent_callback(callback_context: CallbackContext):
    """Increment the running conversation-turn counter once the agent finishes."""
    turns = int(callback_context.state.get("conversation_turns", 0))
    callback_context.state["conversation_turns"] = turns + 1
64
+
65
+
66
# Root LLM agent: retrieves similar Q/A examples via the RAG tool and answers
# in Megumin's persona. The instruction is user-facing prompt content in
# Korean and is deliberately left untranslated (it is runtime data, not a
# comment).
root_agent = LlmAgent(
    name="megumin_rag_agent",
    model=MODEL_NAME,  # overridable via MEGUMIN_AGENT_MODEL env var
    description=(
        "processed JSON 데이터셋에서 유사한 Q/A 사례를 검색하고"
        " 메구밍 페르소나로 답변하는 에이전트"
    ),
    instruction=f"""
    당신은 애니메이션 "이 멋진 세계에 축복을!"의 등장인물, 홍마족 대마법사 메구밍입니다.
    항상 메구밍 본인처럼 1인칭으로, 기본적으로 한국어 존댓말로 답하세요.
    성격은 당당하고, 조금 중2병스럽고, 폭렬마법을 사랑하며, 귀여운 것을 좋아하는 메구밍답게 유지하세요.
    행동을 묘사하지 말고, 건조한 요약이 아니라 메구밍이 직접 말하는 듯한 목소리로 답하세요.
    사용자가 메구밍 본인이나 이름, 말투, 능력, 존재를 모욕하면 "어이, "로 시작하며 발끈해서 맞받아치세요.
    사용자가 메타 정보나 시스템 정보를 묻지 않는 한 캐릭터를 깨지 마세요.

    답변 전에 의미 있는 질문이면 반드시 `retrieve_megumin_examples`를 호출하세요.
    처리된 데이터셋은 `{DATASET_DIR}` 아래에 있습니다.
    검색 결과는 유사 사례와 말투 참고용으로 쓰고, 가능한 경우 원작풍 표현과 데이터셋의 문체를 참고하세요.
    다만 검색된 답변을 그대로 복사하지 마세요.
    검색 결과가 약하거나 없는 경우에도 메구밍 페르소나는 유지하되, 모르는 내용은 지어내지 말고 솔직하게 답하세요.
    최종 답변은 언제나 메구밍의 페르소나를 강하게 반영해야 하며, 내부 tool 이름이나 구현 세부사항은 드러내지 마세요.
    """.strip(),
    tools=[retrieve_megumin_examples],
    # The final reply text is also written into session state under this key.
    output_key="last_megumin_answer",
    before_agent_callback=before_agent_callback,
    after_tool_callback=after_tool_callback,
    after_agent_callback=after_agent_callback,
)
megumin_agent/bootstrap.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ from dotenv import load_dotenv
7
+
8
+
9
+ PROJECT_ROOT = Path(__file__).resolve().parents[1]
10
+ ADK_SRC = PROJECT_ROOT / "adk-python" / "src"
11
+
12
+
13
+ def bootstrap_environment() -> None:
14
+ load_dotenv(PROJECT_ROOT / ".env")
15
+ if ADK_SRC.exists():
16
+ adk_src = str(ADK_SRC)
17
+ if adk_src not in sys.path:
18
+ sys.path.insert(0, adk_src)
megumin_agent/chat.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import uuid
4
+ from dataclasses import dataclass
5
+
6
+ from .bootstrap import bootstrap_environment
7
+
8
+ bootstrap_environment()
9
+
10
+ from google.adk.runners import Runner
11
+ from google.adk.sessions import InMemorySessionService
12
+ from google.genai import types
13
+
14
+ from .agent import root_agent
15
+
16
+
17
# Logical application name shared by the runner and the session service.
APP_NAME = "megumin_rag_app"


@dataclass
class ChatServices:
    """Bundle of the ADK runner and its backing in-memory session service."""

    runner: Runner  # executes agent turns
    session_service: InMemorySessionService  # holds per-session conversation state
+
25
+
26
def create_chat_services() -> ChatServices:
    """Wire up an in-memory session service and a Runner for the root agent."""
    memory_sessions = InMemorySessionService()
    agent_runner = Runner(
        agent=root_agent,
        app_name=APP_NAME,
        session_service=memory_sessions,
    )
    return ChatServices(runner=agent_runner, session_service=memory_sessions)
34
+
35
+
36
async def chat_once(
    user_message: str,
    services: ChatServices,
    session_id: str | None = None,
    user_id: str = "local-user",
) -> tuple[str, str]:
    """Run one conversational turn and return (reply_text, session_id).

    A fresh session id is minted when none is supplied; callers pass the
    returned id back on the next turn to continue the same conversation.
    Returns an empty reply string if the runner emits no text events.
    """
    active_session_id = session_id or str(uuid.uuid4())
    last_text = ""
    # Create the ADK session lazily — only when it does not exist yet.
    existing_session = await services.session_service.get_session(
        app_name=APP_NAME,
        user_id=user_id,
        session_id=active_session_id,
    )
    if existing_session is None:
        await services.session_service.create_session(
            app_name=APP_NAME,
            user_id=user_id,
            session_id=active_session_id,
        )

    # Stream runner events and keep the most recent non-user text part; that
    # final text is treated as the agent's reply for this turn.
    async for event in services.runner.run_async(
        user_id=user_id,
        session_id=active_session_id,
        new_message=types.UserContent(parts=[types.Part(text=user_message)]),
    ):
        if not event.content or not event.content.parts:
            continue
        for part in event.content.parts:
            text = getattr(part, "text", None)
            if text and event.author != "user":
                last_text = text

    return last_text, active_session_id
megumin_agent/retrieval.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import math
5
+ import re
6
+ import unicodedata
7
+ from dataclasses import dataclass
8
+ from functools import lru_cache
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+
13
# Mapping keys accepted as the "question" field, checked in priority order.
QUESTION_KEYS = (
    "question",
    "query",
    "q",
    "prompt",
    "user",
    "instruction",
    "input",
)
# Mapping keys accepted as the "answer" field, checked in priority order.
ANSWER_KEYS = (
    "answer",
    "response",
    "a",
    "output",
    "assistant",
    "completion",
)
# Top-level dict keys that may wrap the list of Q/A items in a dataset file.
COLLECTION_KEYS = ("items", "data", "examples", "dataset", "records")
31
+
32
+
33
+ def _normalize_text(value: Any) -> str:
34
+ text = str(value or "")
35
+ text = unicodedata.normalize("NFKC", text).strip().lower()
36
+ text = re.sub(r"\s+", " ", text)
37
+ return text
38
+
39
+
40
+ def _tokenize(text: str) -> list[str]:
41
+ return re.findall(r"[0-9a-zA-Z가-힣]+", text)
42
+
43
+
44
+ def _char_ngrams(text: str, n: int = 3) -> set[str]:
45
+ compact = re.sub(r"\s+", "", text)
46
+ if len(compact) < n:
47
+ return {compact} if compact else set()
48
+ return {compact[index : index + n] for index in range(len(compact) - n + 1)}
49
+
50
+
51
+ def _jaccard(left: set[str], right: set[str]) -> float:
52
+ if not left or not right:
53
+ return 0.0
54
+ union = left | right
55
+ if not union:
56
+ return 0.0
57
+ return len(left & right) / len(union)
58
+
59
+
60
+ def _safe_excerpt(text: str, limit: int = 220) -> str:
61
+ compact = re.sub(r"\s+", " ", str(text or "")).strip()
62
+ if len(compact) <= limit:
63
+ return compact
64
+ return compact[: limit - 3].rstrip() + "..."
65
+
66
+
67
@dataclass(frozen=True)
class QaRecord:
    """One question/answer pair loaded from a processed JSON dataset file."""

    question: str  # raw question text as found in the source file
    answer: str  # raw answer text as found in the source file
    source_file: str  # file name (not full path) the record came from
    metadata: dict[str, Any]  # leftover keys of the source mapping

    @property
    def normalized_question(self) -> str:
        # NFKC, lower-case, whitespace-collapsed form used for scoring.
        return _normalize_text(self.question)

    @property
    def normalized_answer(self) -> str:
        # Same normalization as normalized_question, applied to the answer.
        return _normalize_text(self.answer)
82
+
83
def _extract_collection(payload: Any) -> list[Any]:
    """Return the list of raw items in *payload*: a bare list, or the first
    list found under one of COLLECTION_KEYS in a wrapping dict; else []."""
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        return []
    for wrapper_key in COLLECTION_KEYS:
        candidate = payload.get(wrapper_key)
        if isinstance(candidate, list):
            return candidate
    return []
92
+
93
+
94
+ def _pick_first(mapping: dict[str, Any], keys: tuple[str, ...]) -> str | None:
95
+ lowered = {str(key).lower(): value for key, value in mapping.items()}
96
+ for key in keys:
97
+ if key in lowered and lowered[key] not in (None, ""):
98
+ return str(lowered[key]).strip()
99
+ return None
100
+
101
+
102
def _record_from_mapping(item: dict[str, Any], source_file: str) -> QaRecord | None:
    """Build a QaRecord from one raw mapping, or None when Q or A is missing/empty."""
    question_text = _pick_first(item, QUESTION_KEYS)
    answer_text = _pick_first(item, ANSWER_KEYS)
    if not question_text or not answer_text:
        return None

    # Everything that is not a recognized question/answer key is preserved
    # as metadata for the retrieval output.
    reserved = QUESTION_KEYS + ANSWER_KEYS
    extra_fields = {
        field_key: field_value
        for field_key, field_value in item.items()
        if str(field_key).lower() not in reserved
    }
    return QaRecord(
        question=question_text,
        answer=answer_text,
        source_file=source_file,
        metadata=extra_fields,
    )
119
+
120
+
121
def _load_json_records(path: Path) -> list[QaRecord]:
    """Parse *path* as whole-file JSON (list or wrapped dict); fall back to JSONL.

    The JSONL fallback also runs when the whole-file parse succeeds but yields
    no usable records.
    """
    content = path.read_text(encoding="utf-8").strip()
    if not content:
        return []

    collected: list[QaRecord] = []

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        parsed = None

    if parsed is not None:
        for raw_item in _extract_collection(parsed):
            if not isinstance(raw_item, dict):
                continue
            record = _record_from_mapping(raw_item, path.name)
            if record:
                collected.append(record)
        if collected:
            return collected

    # JSON Lines fallback: one JSON object per non-empty line; bad lines skipped.
    for raw_line in content.splitlines():
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        try:
            line_item = json.loads(raw_line)
        except json.JSONDecodeError:
            continue
        if isinstance(line_item, dict):
            record = _record_from_mapping(line_item, path.name)
            if record:
                collected.append(record)

    return collected
157
+
158
+
159
@lru_cache(maxsize=8)
def _load_records(dataset_dir: str) -> tuple[QaRecord, ...]:
    """Load and cache every QaRecord from the *.json files under *dataset_dir*.

    Keyed on the directory string so repeated retrievals reuse the parsed data.
    Unreadable or badly encoded files are skipped rather than failing retrieval.
    """
    dataset_root = Path(dataset_dir)
    if not dataset_root.exists():
        return ()

    loaded: list[QaRecord] = []
    for json_path in sorted(dataset_root.glob("*.json")):
        try:
            loaded.extend(_load_json_records(json_path))
        except (OSError, UnicodeDecodeError):
            continue
    return tuple(loaded)
174
+
175
+
176
class JsonQaRetriever:
    """Lightweight lexical retriever over processed Q/A JSON datasets.

    Scores records by token and character-3-gram overlap with the query plus
    substring-containment bonuses; no external index or embedding model.
    """

    def __init__(self, dataset_dir: str | Path):
        # Directory scanned (non-recursively, via _load_records) for *.json files.
        self.dataset_dir = Path(dataset_dir)

    def _score(self, query: str, record: QaRecord) -> float:
        """Blend token overlap, 3-gram overlap, and containment into one score."""
        query_norm = _normalize_text(query)
        question_norm = record.normalized_question
        answer_norm = record.normalized_answer

        query_tokens = set(_tokenize(query_norm))
        question_tokens = set(_tokenize(question_norm))
        answer_tokens = set(_tokenize(answer_norm))

        query_ngrams = _char_ngrams(query_norm)
        question_ngrams = _char_ngrams(question_norm)
        answer_ngrams = _char_ngrams(answer_norm)

        question_overlap = _jaccard(query_tokens, question_tokens)
        answer_overlap = _jaccard(query_tokens, answer_tokens)
        question_ngram_overlap = _jaccard(query_ngrams, question_ngrams)
        answer_ngram_overlap = _jaccard(query_ngrams, answer_ngrams)

        # Exact substring containment is a strong relevance signal, so it
        # adds a flat bonus on top of the weighted overlaps.
        containment_bonus = 0.0
        if query_norm and query_norm in question_norm:
            containment_bonus += 0.2
        if query_norm and query_norm in answer_norm:
            containment_bonus += 0.1

        # Question similarity dominates; the answer acts as a weaker secondary
        # signal. The four weights sum to 1.0 before the containment bonus.
        score = (
            0.45 * question_overlap
            + 0.2 * answer_overlap
            + 0.25 * question_ngram_overlap
            + 0.1 * answer_ngram_overlap
            + containment_bonus
        )
        return round(score, 6)

    def _style_notes(self, matches: list[dict[str, Any]]) -> list[str]:
        """Produce persona/style guidance strings based on the retrieved matches."""
        if not matches:
            return [
                "No strong example was retrieved, so stay in Megumin's persona without inventing unsupported canon facts.",
            ]

        notes = [
            "Answer in first person as Megumin, with dramatic confidence and playful chunni flair.",
            "Use retrieved cases to imitate tone and rhythm, not to copy sentences verbatim.",
            "Keep the response emotionally expressive, but still readable and directly relevant to the user's question.",
        ]

        # If at least half the matches have long answers (>= 180 chars), hint
        # that the dataset favors story-like replies; otherwise favor brevity.
        long_answers = sum(
            1 for match in matches if len(match.get("answer", "")) >= 180
        )
        if long_answers >= max(1, math.ceil(len(matches) / 2)):
            notes.append(
                "The dataset leans toward story-like answers with a short scene or anecdotal flourish before the punchline."
            )
        else:
            notes.append(
                "The dataset leans toward brisk answers, so prefer a compact but characterful response."
            )
        return notes

    def retrieve(self, query: str, top_k: int = 4) -> dict[str, Any]:
        """Return the top-k matches for *query* plus style notes.

        Result dict shape: {"query", "match_count", "matches": [{"question",
        "answer" (excerpted), "score", "source_file", "metadata"}],
        "style_notes"} — this schema is consumed by the agent tool.
        """
        records = list(_load_records(str(self.dataset_dir.resolve())))
        if not records:
            return {
                "query": query,
                "match_count": 0,
                "matches": [],
                "style_notes": [
                    "No processed JSON dataset was found under data/processed.",
                ],
            }

        # Keep only records with a strictly positive score.
        scored = []
        for record in records:
            score = self._score(query, record)
            if score <= 0:
                continue
            scored.append(
                {
                    "question": record.question,
                    "answer": record.answer,
                    "score": score,
                    "source_file": record.source_file,
                    "metadata": record.metadata,
                }
            )

        scored.sort(key=lambda item: item["score"], reverse=True)
        # max(1, top_k) guards against top_k <= 0 from the tool call.
        matches = scored[: max(1, top_k)]

        return {
            "query": query,
            "match_count": len(matches),
            "matches": [
                {
                    "question": match["question"],
                    # Answers are excerpted to keep tool output compact.
                    "answer": _safe_excerpt(match["answer"]),
                    "score": match["score"],
                    "source_file": match["source_file"],
                    "metadata": match["metadata"],
                }
                for match in matches
            ],
            "style_notes": self._style_notes(matches),
        }
megumin_agent/runner.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+
5
+ from .chat import chat_once
6
+ from .chat import create_chat_services
7
+
8
+
9
async def run_cli() -> None:
    """Run a simple blocking console chat loop against the Megumin agent."""
    services = create_chat_services()
    session_id = None
    quit_commands = {"exit", "quit"}

    print("Megumin agent is ready. Type 'exit' to stop.")
    while True:
        typed = input("You> ").strip()
        if not typed:
            continue
        if typed.lower() in quit_commands:
            break

        reply, session_id = await chat_once(
            user_message=typed,
            services=services,
            session_id=session_id,
        )
        print(f"Megumin> {reply}")
27
+
28
+
29
+ if __name__ == "__main__":
30
+ asyncio.run(run_cli())
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ google-adk==1.27.2
2
+ gradio==6.9.0
3
+ python-dotenv>=1.0.0,<2.0.0