Spaces:

gbrabbit
/

lily_fast_api

Sleeping

App Files Community

gbrabbit committed on Aug 24, 2025

Commit

0e9a45c

1 Parent(s): 1d1372e

Auto commit at 25-2025-08 3:12:15

Browse files

Files changed (30) hide show

.dockerignore +4 -0
.gitignore +1 -0
data/context/room_776_kdy.json +118 -0
data/context/room_777_kdy.json +166 -0
data/memory/memory.db +0 -0
lily_llm_api/api/routers/document_router.py +6 -1
lily_llm_api/api/routers/generation_router.py +90 -8
lily_llm_api/api/routers/multimodal_rag_router.py +8 -1
lily_llm_api/api/routers/user_memory_router.py +58 -0
lily_llm_api/app.py +10 -5
lily_llm_api/core/app_factory.py +10 -5
lily_llm_api/lily_llm_api (2).zip +3 -0
lily_llm_api/lily_llm_api (3).zip +3 -0
lily_llm_api/lily_llm_api (4).zip +3 -0
lily_llm_api/lily_llm_api (5).zip +3 -0
lily_llm_api/lily_llm_api (6).zip +3 -0
lily_llm_api/lily_llm_api.zip +3 -0
lily_llm_api/models/kanana_1_5_v_3b_instruct.py +30 -11
lily_llm_api/services/generation_service.py +101 -33
lily_llm_api/services/model_service.py +27 -1
lily_llm_api/services/session_registry.py +66 -1
lily_llm_core/lily_llm_core (2).zip +3 -0
lily_llm_core/lily_llm_core (3).zip +3 -0
lily_llm_core/lily_llm_core (4).zip +3 -0
lily_llm_core/lily_llm_core (5).zip +3 -0
lily_llm_core/lily_llm_core (6).zip +3 -0
lily_llm_core/lily_llm_core.zip +3 -0
lily_llm_core/lora_manager.py +25 -2
lily_llm_core/memory_store.py +179 -0
requirements.txt +2 -0

.dockerignore CHANGED Viewed

@@ -17,6 +17,7 @@ uploads/
 simple_stores/
 notebooks/
 lily_llm_etc/
 *.safetensors
 *.pth
@@ -90,6 +91,9 @@ data/
 uploads/
 vector_stores/
 # Backup files
 backup/
 *.backup

 simple_stores/
 notebooks/
 lily_llm_etc/
+lily_generate_project/lily_generate_package/data/
 *.safetensors
 *.pth
 uploads/
 vector_stores/
+# Ensure local runtime data under this package is excluded
+./lily_generate_project/lily_generate_package/data/
 # Backup files
 backup/
 *.backup

.gitignore CHANGED Viewed

@@ -18,6 +18,7 @@ uploads/
 simple_stores/
 notebooks/
 lily_llm_etc/
 *.safetensors
 *.pth

 simple_stores/
 notebooks/
 lily_llm_etc/
+lily_generate_project/lily_generate_package/data/
 *.safetensors
 *.pth

data/context/room_776_kdy.json ADDED Viewed

	@@ -0,0 +1,118 @@

+{
+  "export_timestamp": 1756046222.1794097,
+  "session_id": "room_776_kdy",
+  "system_prompt": "",
+  "conversation_history": [
+    {
+      "role": "user",
+      "content": "안녕 내 이름은 이철수야. 직업은 코딩강사.",
+      "timestamp": 1756045958.543232,
+      "message_id": "user_1756045958543",
+      "metadata": {
+        "session_id": "room_776_kdy"
+      },
+      "summary": "안녕 내 이름은 이철수야. 직업은 코딩강사.",
+      "tokens_estimated": 10
+    },
+    {
+      "role": "user",
+      "content": "안녕 내 이름은 이철수야. 직업은 코딩강사.",
+      "timestamp": 1756046008.1830719,
+      "message_id": "user_1756046008183",
+      "metadata": {
+        "session_id": "room_776_kdy",
+        "room_id": "776",
+        "images_used": false,
+        "num_images": 0
+      },
+      "summary": "안녕 내 이름은 이철수야. 직업은 코딩강사.",
+      "tokens_estimated": 10
+    },
+    {
+      "role": "assistant",
+      "content": "안녕하세요, 이철수 강사님! 코딩 강사로서의 경험이 정말 대단하시겠어요. 어떤 코딩 강의를 주로 가르치시나요? 초보자부터 전문가까지 다양한 수준의 학생들을 가르치시나요?",
+      "timestamp": 1756046008.1830719,
+      "message_id": "assistant_1756046008183",
+      "metadata": {
+        "session_id": "room_776_kdy",
+        "room_id": "776"
+      },
+      "summary": "안녕하세요, 이철수 강사님! 코딩 강사로서의 경험이 정말 대단하시겠어요. 어떤 코딩 강의를 주로 가르치시나요? 초보자부터 전문가까지 다양한 수준의 학생들을 가르치시나요?",
+      "tokens_estimated": 42
+    },
+    {
+      "role": "assistant",
+      "content": "안녕하세요, 이철수 강사님! 코딩 강사로서의 경험이 정말 대단하시겠어요. 어떤 코딩 강의를 주로 가르치시나요? 초보자부터 전문가까지 다양한 수준의 학생들을 가르치시나요?",
+      "timestamp": 1756046008.1993172,
+      "message_id": "assistant_1756046008199",
+      "metadata": {
+        "session_id": "room_776_kdy"
+      },
+      "summary": "안녕하세요, 이철수 강사님! 코딩 강사로서의 경험이 정말 대단하시겠어요. 어떤 코딩 강의를 주로 가르치시나요? 초보자부터 전문가까지 다양한 수준의 학생들을 가르치시나요?",
+      "tokens_estimated": 42
+    },
+    {
+      "role": "user",
+      "content": "블록코딩과 ai 에 대해 가르쳐.",
+      "timestamp": 1756046057.3309016,
+      "message_id": "user_1756046057330",
+      "metadata": {
+        "session_id": "room_776_kdy"
+      },
+      "summary": "블록코딩과 ai 에 대해 가르쳐.",
+      "tokens_estimated": 6
+    },
+    {
+      "role": "user",
+      "content": "블록코딩과 ai 에 대해 가르쳐.",
+      "timestamp": 1756046222.1559293,
+      "message_id": "user_1756046222155",
+      "metadata": {
+        "session_id": "room_776_kdy",
+        "room_id": "776",
+        "images_used": false,
+        "num_images": 0
+      },
+      "summary": "블록코딩과 ai 에 대해 가르쳐.",
+      "tokens_estimated": 6
+    },
+    {
+      "role": "assistant",
+      "content": "아, 블록코딩과 AI에 대해 가르치신다니 정말 흥미롭네요! 블록코딩은 초보자들이 코딩의 기본 개념을 쉽게 이해할 수 있도록 돕는 도구로, 스크래치나 마인크래프트",
+      "timestamp": 1756046222.1559293,
+      "message_id": "assistant_1756046222155",
+      "metadata": {
+        "session_id": "room_776_kdy",
+        "room_id": "776"
+      },
+      "summary": "아, 블록코딩과 AI에 대해 가르치신다니 정말 흥미롭네요! 블록코딩은 초보자들이 코딩의 기본 개념을 쉽게 이해할 수 있도록 돕는 도구로, 스크래치나 마인크래프트",
+      "tokens_estimated": 39
+    },
+    {
+      "role": "assistant",
+      "content": "아, 블록코딩과 AI에 대해 가르치신다니 정말 흥미롭네요! 블록코딩은 초보자들이 코딩의 기본 개념을 쉽게 이해할 수 있도록 돕는 도구로, 스크래치나 마인크래프트",
+      "timestamp": 1756046222.1730103,
+      "message_id": "assistant_1756046222173",
+      "metadata": {
+        "session_id": "room_776_kdy"
+      },
+      "summary": "아, 블록코딩과 AI에 대해 가르치신다니 정말 흥미롭네요! 블록코딩은 초보자들이 코딩의 기본 개념을 쉽게 이해할 수 있도록 돕는 도구로, 스크래치나 마인크래프트",
+      "tokens_estimated": 39
+    }
+  ],
+  "context_stats": {
+    "session_id": "room_776_kdy",
+    "total_turns": 8,
+    "user_messages": 4,
+    "assistant_messages": 4,
+    "estimated_tokens": 179,
+    "context_length": 716,
+    "memory_usage": 0.4,
+    "oldest_message": 1756045958.543232,
+    "newest_message": 1756046222.1730103,
+    "turn_summaries_count": 1,
+    "turn_summaries_tokens": 45,
+    "compression_count": 0,
+    "last_compression": null
+  }
+}

data/context/room_777_kdy.json ADDED Viewed

	@@ -0,0 +1,166 @@

+{
+  "export_timestamp": 1756054304.3968902,
+  "session_id": "room_777_kdy",
+  "system_prompt": "",
+  "conversation_history": [
+    {
+      "role": "user",
+      "content": "안녕 내 이름은 김철수야. 직업은 코딩강사.",
+      "timestamp": 1756049364.0584369,
+      "message_id": "user_1756049364058",
+      "metadata": {
+        "session_id": "room_777_kdy"
+      },
+      "summary": "안녕 내 이름은 김철수야. 직업은 코딩강사.",
+      "tokens_estimated": 10
+    },
+    {
+      "role": "user",
+      "content": "안녕 내 이름은 김철수야. 직업은 코딩강사.",
+      "timestamp": 1756049921.6687996,
+      "message_id": "user_1756049921668",
+      "metadata": {
+        "session_id": "room_777_kdy",
+        "room_id": "777",
+        "images_used": false,
+        "num_images": 0
+      },
+      "summary": "안녕 내 이름은 김철수야. 직업은 코딩강사.",
+      "tokens_estimated": 10
+    },
+    {
+      "role": "assistant",
+      "content": "안녕하세요, 김철수님! 코딩 강사로서의 경험이 정말 멋지네요. 어떤 종류의 코딩을 가르치시나요? 초보자부터 전문가까지 다양한 수준의 학생들을 가르치시나요? 또는 특정 프로그래밍 언어에",
+      "timestamp": 1756049921.6813264,
+      "message_id": "assistant_1756049921681",
+      "metadata": {
+        "session_id": "room_777_kdy",
+        "room_id": "777"
+      },
+      "summary": "안녕하세요, 김철수님! 코딩 강사로서의 경험이 정말 멋지네요. 어떤 종류의 코딩을 가르치시나요? 초보자부터 전문가까지 다양한 수준의 학생들을 가르치시나요? 또는 특정 프로그래밍 언어에",
+      "tokens_estimated": 46
+    },
+    {
+      "role": "assistant",
+      "content": "안녕하세요, 김철수님! 코딩 강사로서의 경험이 정말 멋지네요. 어떤 종류의 코딩을 가르치시나요? 초보자부터 전문가까지 다양한 수준의 학생들을 가르치시나요? 또는 특정 프로그래밍 언어에",
+      "timestamp": 1756049921.6970212,
+      "message_id": "assistant_1756049921697",
+      "metadata": {
+        "session_id": "room_777_kdy"
+      },
+      "summary": "안녕하세요, 김철수님! 코딩 강사로서의 경험이 정말 멋지네요. 어떤 종류의 코딩을 가르치시나요? 초보자부터 전문가까지 다양한 수준의 학생들을 가르치시나요? 또는 특정 프로그래밍 언어에",
+      "tokens_estimated": 46
+    },
+    {
+      "role": "user",
+      "content": "블록 코딩을 가르쳐.",
+      "timestamp": 1756050060.2033277,
+      "message_id": "user_1756050060203",
+      "metadata": {
+        "session_id": "room_777_kdy"
+      },
+      "summary": "블록 코딩을 가르쳐.",
+      "tokens_estimated": 5
+    },
+    {
+      "role": "user",
+      "content": "블록 코딩을 가르쳐.",
+      "timestamp": 1756050677.3530622,
+      "message_id": "user_1756050677348",
+      "metadata": {
+        "session_id": "room_777_kdy",
+        "room_id": "777",
+        "images_used": false,
+        "num_images": 0
+      },
+      "summary": "블록 코딩을 가르쳐.",
+      "tokens_estimated": 5
+    },
+    {
+      "role": "assistant",
+      "content": "블록 코딩은 초보자들에게 매우 유용한 학습 도구입니다. 어떤 블록 코딩 도구를 사용하고 계신가요? 예를 들어, Scratch, Blockly, 또는 다른 도구가 있을 수 있습니다. 또한, 블록 코딩을 가르칠 때 어떤 방식",
+      "timestamp": 1756050677.369919,
+      "message_id": "assistant_1756050677369",
+      "metadata": {
+        "session_id": "room_777_kdy",
+        "room_id": "777"
+      },
+      "summary": "블록 코딩은 초보자들에게 매우 유용한 학습 도구입니다. 어떤 블록 코딩 도구를 사용하고 계신가요? 예를 들어, Scratch, Blockly, 또는 다른 도구가 있을 수 있습니다. 또한, 블록 코딩을 가르칠 때 어떤 방식",
+      "tokens_estimated": 51
+    },
+    {
+      "role": "assistant",
+      "content": "블록 코딩은 초보자들에게 매우 유용한 학습 도구입니다. 어떤 블록 코딩 도구를 사용하고 계신가요? 예를 들어, Scratch, Blockly, 또는 다른 도구가 있을 수 있습니다. 또한, 블록 코딩을 가르칠 때 어떤 방식",
+      "timestamp": 1756050677.4095056,
+      "message_id": "assistant_1756050677409",
+      "metadata": {
+        "session_id": "room_777_kdy"
+      },
+      "summary": "블록 코딩은 초보자들에게 매우 유용한 학습 도구입니다. 어떤 블록 코딩 도구를 사용하고 계신가요? 예를 들어, Scratch, Blockly, 또는 다른 도구가 있을 수 있습니다. 또한, 블록 코딩을 가르칠 때 어떤 방식",
+      "tokens_estimated": 51
+    },
+    {
+      "role": "user",
+      "content": "스크래치를 가르쳐. 근데 주로 엠블럭을 사용해. 아두이노와 연동한 프로젝트가 반응이 좋거든",
+      "timestamp": 1756054249.7711434,
+      "message_id": "user_1756054249771",
+      "metadata": {
+        "session_id": "room_777_kdy"
+      },
+      "summary": "스크래치를 가르쳐. 근데 주로 엠블럭을 사용해. 아두이노와 연동한 프로젝트가 반응이 좋거든",
+      "tokens_estimated": 23
+    },
+    {
+      "role": "user",
+      "content": "스크래치를 가르쳐. 근데 주로 엠블럭을 사용해. 아두이노와 연동한 프로젝트가 반응이 좋거든",
+      "timestamp": 1756054304.3831217,
+      "message_id": "user_1756054304381",
+      "metadata": {
+        "session_id": "room_777_kdy",
+        "room_id": "777",
+        "images_used": false,
+        "num_images": 0
+      },
+      "summary": "스크래치를 가르쳐. 근데 주로 엠블럭을 사용해. 아두이노와 연동한 프로젝트가 반응이 좋거든",
+      "tokens_estimated": 23
+    },
+    {
+      "role": "assistant",
+      "content": "아두이노와 연동한 프로젝트는 학생들에게 매우 흥미로운 주제가 될 수 있습니다. 엠블럭을 사용하여 아두이노와 연동하는 프로젝트를 가르칠 때, 학생들이 쉽게 따라올 수 있도록 단계별로 설명하는 것이 중요",
+      "timestamp": 1756054304.3851252,
+      "message_id": "assistant_1756054304385",
+      "metadata": {
+        "session_id": "room_777_kdy",
+        "room_id": "777"
+      },
+      "summary": "아두이노와 연동한 프로젝트는 학생들에게 매우 흥미로운 주제가 될 수 있습니다. 엠블럭을 사용하여 아두이노와 연동하는 프로젝트를 가르칠 때, 학생들이 쉽게 따라올 수 있도록 단계별로 설명하는 것이 중요",
+      "tokens_estimated": 51
+    },
+    {
+      "role": "assistant",
+      "content": "아두이노와 연동한 프로젝트는 학생들에게 매우 흥미로운 주제가 될 수 있습니다. 엠블럭을 사용하여 아두이노와 연동하는 프로젝트를 가르칠 때, 학생들이 쉽게 따라올 수 있도록 단계별로 설명하는 것이 중요",
+      "timestamp": 1756054304.3928819,
+      "message_id": "assistant_1756054304392",
+      "metadata": {
+        "session_id": "room_777_kdy"
+      },
+      "summary": "아두이노와 연동한 프로젝트는 학생들에게 매우 흥미로운 주제가 될 수 있습니다. 엠블럭을 사용하여 아두이노와 연동하는 프로젝트를 가르칠 때, 학생들이 쉽게 따라올 수 있도록 단계별로 설명하는 것이 중요",
+      "tokens_estimated": 51
+    }
+  ],
+  "context_stats": {
+    "session_id": "room_777_kdy",
+    "total_turns": 12,
+    "user_messages": 6,
+    "assistant_messages": 6,
+    "estimated_tokens": 307,
+    "context_length": 1230,
+    "memory_usage": 0.6,
+    "oldest_message": 1756049364.0584369,
+    "newest_message": 1756054304.3928819,
+    "turn_summaries_count": 1,
+    "turn_summaries_tokens": 74,
+    "compression_count": 0,
+    "last_compression": null
+  }
+}

data/memory/memory.db ADDED Viewed

Binary file (20.5 kB). View file

lily_llm_api/api/routers/document_router.py CHANGED Viewed

@@ -12,7 +12,7 @@ from ...models.schemas import (
     DocumentUploadResponse, RAGQueryRequest, RAGQueryResponse,
     DocumentProcessResponse, MultimodalRAGResponse
 )
-from ...services.session_registry import set_user_for_room
 logger = logging.getLogger(__name__)
 router = APIRouter()
@@ -54,6 +54,11 @@ async def upload_document(
             if result.get("success"):
                 processing_time = time.time() - start_time
                 return DocumentUploadResponse(
                     success=True,
                     document_id=result.get("document_id", document_id),

     DocumentUploadResponse, RAGQueryRequest, RAGQueryResponse,
     DocumentProcessResponse, MultimodalRAGResponse
 )
+from ...services.session_registry import set_user_for_room, set_flag_for_room
 logger = logging.getLogger(__name__)
 router = APIRouter()
             if result.get("success"):
                 processing_time = time.time() - start_time
+                # 업로드 직후, 같은 방에서 다음 1회 생성은 이미지 복구를 허용
+                try:
+                    set_flag_for_room(room_id, "use_rag_images_once", True)
+                except Exception:
+                    pass
                 return DocumentUploadResponse(
                     success=True,
                     document_id=result.get("document_id", document_id),

lily_llm_api/api/routers/generation_router.py CHANGED Viewed

@@ -2,6 +2,7 @@
 Generation router for Lily LLM API
 """
 from fastapi import APIRouter, HTTPException, Request, UploadFile, File, Form, Depends
 from typing import Optional, List
 import logging
 import time
@@ -10,7 +11,7 @@ from ...models.schemas import GenerateResponse, MultimodalGenerateResponse
 from ...services.generation_service import generate_sync
 from ...services.model_service import is_model_loaded
 from ...utils.system_utils import select_model_interactive
-from ...services.session_registry import get_user_for_room, set_user_for_room, set_user_for_session
 logger = logging.getLogger(__name__)
 router = APIRouter()
@@ -29,7 +30,10 @@ async def generate(request: Request,
                   use_rag_images: bool = Form(False),
                   use_rag_text: bool = Form(False),
                   document_id: str = Form(None),
-                  image_short_side: int = Form(None)):
     if not is_model_loaded():
         raise HTTPException(status_code=503, detail="모델이 로드되지 않았습니다.")
@@ -46,11 +50,12 @@ async def generate(request: Request,
         except Exception:
             pass
-    # 세션 ID가 없으면 자동 생성 (채팅방별 고유 세션)
-    if not session_id:
-        timestamp = int(time.time())
-        session_id = f"room_{room_id}_user_{user_id}_{timestamp}"
-        print(f"🔍 [DEBUG] 자동 세션 ID 생성: {session_id} (채팅방: {room_id}, 사용자: {user_id})")
     else:
         # 제공된 세션에도 사용자 매핑 저장
         try:
@@ -61,6 +66,31 @@ async def generate(request: Request,
     if use_context:
         try:
             from lily_llm_core.context_manager import context_manager
             context_manager.add_user_message(prompt, metadata={"session_id": session_id})
             print(f"🔍 [DEBUG] 사용자 메시지 추가됨 (세션: {session_id})")
         except Exception as e:
@@ -77,8 +107,38 @@ async def generate(request: Request,
                 logger.warning(f"이미지 로드 실패: {e}")
     try:
         # generate_sync 함수 호출 (컨텍스트 포함)
-        result = generate_sync(prompt, image_data_list, use_context=use_context, session_id=session_id, user_id=user_id, room_id=room_id, use_rag_images=use_rag_images, use_rag_text=use_rag_text, document_id=document_id, image_short_side=image_short_side)
         if "error" in result:
             raise HTTPException(status_code=500, detail=result["error"])
@@ -87,6 +147,28 @@ async def generate(request: Request,
             try:
                 from lily_llm_core.context_manager import context_manager
                 context_manager.add_assistant_message(result["generated_text"], metadata={"session_id": session_id})
             except Exception as e:
                 logger.warning(f"⚠️ 컨텍스트 관리자 사용 불가: {e}")

 Generation router for Lily LLM API
 """
 from fastapi import APIRouter, HTTPException, Request, UploadFile, File, Form, Depends
+import os
 from typing import Optional, List
 import logging
 import time
 from ...services.generation_service import generate_sync
 from ...services.model_service import is_model_loaded
 from ...utils.system_utils import select_model_interactive
+from ...services.session_registry import get_user_for_room, set_user_for_room, set_user_for_session, pop_flag_for_room
 logger = logging.getLogger(__name__)
 router = APIRouter()
                   use_rag_images: bool = Form(False),
                   use_rag_text: bool = Form(False),
                   document_id: str = Form(None),
+                  image_short_side: int = Form(None),
+                  # 새 옵션: 생성 토큰 수/입력 길이 제어
+                  max_new_tokens: int = Form(None),
+                  input_max_length: int = Form(None)):
     if not is_model_loaded():
         raise HTTPException(status_code=503, detail="모델이 로드되지 않았습니다.")
         except Exception:
             pass
+    # 방/사용자 기반 고정 세션 ID로 정규화
+    stable_session_id = f"room_{room_id}_{user_id}"
+    if not session_id or session_id != stable_session_id:
+        original = session_id
+        session_id = stable_session_id
+        print(f"🔍 [DEBUG] 세션 ID 정규화: {original} -> {session_id}")
     else:
         # 제공된 세션에도 사용자 매핑 저장
         try:
     if use_context:
         try:
             from lily_llm_core.context_manager import context_manager
+            # 중기/장기 메모리 주입: room/user 기반
+            try:
+                from lily_llm_core.memory_store import memory_store
+                rm = memory_store.get_room_memory(room_id)
+                um = memory_store.get_user_memory(user_id)
+                note_parts = []
+                if rm and (rm.get("summary") or rm.get("key_topics")):
+                    note_parts.append(f"[Room Memory]\n{rm.get('summary','')}\nTopics: {rm.get('key_topics','')}")
+                if um and um.get("notes"):
+                    note_parts.append(f"[User Memory]\n{um.get('notes','')}")
+                if note_parts:
+                    context_manager.set_system_note(session_id, "hier_mem", "\n\n".join(note_parts))
+            except Exception as _me:
+                print(f"⚠️ [DEBUG] 메모리 주입 실패: {_me}")
+            # 컨텍스트 영속화 설정: 필요 시 세션 파일에서 로드
+            try:
+                persist = os.getenv('LILY_CONTEXT_PERSIST', '0') in ['1', 'true', 'True']
+                if persist:
+                    base_dir = os.getenv('LILY_CONTEXT_DIR', 'data/context')
+                    os.makedirs(base_dir, exist_ok=True)
+                    session_file = os.path.join(base_dir, f"{session_id}.json")
+                    if hasattr(context_manager, 'load_session_from_file'):
+                        context_manager.load_session_from_file(session_id, session_file)
+            except Exception as _e:
+                print(f"⚠️ [DEBUG] 컨텍스트 로드 실패: {_e}")
             context_manager.add_user_message(prompt, metadata={"session_id": session_id})
             print(f"🔍 [DEBUG] 사용자 메시지 추가됨 (세션: {session_id})")
         except Exception as e:
                 logger.warning(f"이미지 로드 실패: {e}")
     try:
+        # UX: 첨부 이미지가 있으면 자동으로 멀티모달 허용
+        if image_data_list and len([img for img in image_data_list if img]) > 0:
+            use_rag_images = True
+            # 이미지가 있으면 텍스트 컨텍스트는 유지하고, RAG 텍스트는 기본 그대로 사용
+        # 텍스트-only 강제: 첨부 이미지가 전혀 없고 플래그가 False면 이미지 리스트를 비움
+        if (not image_data_list or len([img for img in image_data_list if img]) == 0) and not use_rag_images:
+            # 업로드 직후 1회 한정 자동 이미지 허용 플래그가 있으면 소비(pop)하여 True로 전환
+            try:
+                once_flag = pop_flag_for_room(room_id, "use_rag_images_once")
+                if once_flag:
+                    use_rag_images = True
+                    print(f"🔍 [DEBUG] 룸 {room_id}에 저장된 1회성 이미지 복구 플래그 사용 -> use_rag_images=True")
+            except Exception:
+                pass
+            image_data_list = []
         # generate_sync 함수 호출 (컨텍스트 포함)
+        result = generate_sync(
+            prompt,
+            image_data_list,
+            # 생성 길이: 신/구 파라미터 모두 지원 (신>구 우선)
+            max_length=max_new_tokens,
+            use_context=use_context,
+            session_id=session_id,
+            user_id=user_id,
+            room_id=room_id,
+            use_rag_images=use_rag_images,
+            use_rag_text=use_rag_text,
+            document_id=document_id,
+            image_short_side=image_short_side,
+            input_max_length=input_max_length
+        )
         if "error" in result:
             raise HTTPException(status_code=500, detail=result["error"])
             try:
                 from lily_llm_core.context_manager import context_manager
                 context_manager.add_assistant_message(result["generated_text"], metadata={"session_id": session_id})
+                # 컨텍스트 영속화: 턴 저장 후 파일로 저장
+                try:
+                    persist = os.getenv('LILY_CONTEXT_PERSIST', '0') in ['1', 'true', 'True']
+                    if persist:
+                        base_dir = os.getenv('LILY_CONTEXT_DIR', 'data/context')
+                        os.makedirs(base_dir, exist_ok=True)
+                        session_file = os.path.join(base_dir, f"{session_id}.json")
+                        if hasattr(context_manager, 'save_session_to_file'):
+                            context_manager.save_session_to_file(session_id, session_file)
+                    # 중기(room) 메모리 갱신: 최근 턴 요약을 압축 저장
+                    try:
+                        from lily_llm_core.memory_store import memory_store
+                        # 간단히 최근 요약 컨텍스트를 저장하고 키토픽은 비움(후속 확장 지점)
+                        summary_text = ""
+                        if hasattr(context_manager, 'get_summary_context'):
+                            summary_text = context_manager.get_summary_context(session_id)
+                        last_ts = time.time()
+                        memory_store.upsert_room_memory(room_id, summary_text, "", last_ts)
+                    except Exception as _ms:
+                        print(f"⚠️ [DEBUG] 룸 메모리 저장 실패: {_ms}")
+                except Exception as _e:
+                    print(f"⚠️ [DEBUG] 컨텍스트 저장 실패: {_e}")
             except Exception as e:
                 logger.warning(f"⚠️ 컨텍스트 관리자 사용 불가: {e}")

lily_llm_api/api/routers/multimodal_rag_router.py CHANGED Viewed

@@ -82,7 +82,11 @@ async def generate_hybrid_rag_response(
     use_image: bool = Form(True),
     use_latex: bool = Form(True),
     use_latex_ocr: bool = Form(False),  # LaTeX-OCR 기능이 비활성화됨
     max_length: Optional[int] = Form(None),
     temperature: Optional[float] = Form(None),
     top_p: Optional[float] = Form(None),
     do_sample: Optional[bool] = Form(None)
@@ -91,10 +95,13 @@ async def generate_hybrid_rag_response(
     try:
         try:
             from lily_llm_core.hybrid_rag_processor import hybrid_rag_processor
             result = hybrid_rag_processor.generate_hybrid_response(
                 query, user_id, document_id,
                 use_text, use_image, use_latex, use_latex_ocr,
-                max_length, temperature, top_p, do_sample
             )
         except ImportError:
             result = {

     use_image: bool = Form(True),
     use_latex: bool = Form(True),
     use_latex_ocr: bool = Form(False),  # LaTeX-OCR 기능이 비활성화됨
+    # 생성 길이: 기존 max_length에서 새로운 max_new_tokens로 이행
     max_length: Optional[int] = Form(None),
+    max_new_tokens: Optional[int] = Form(None),
+    # 입력 토큰 상한(토크나이즈 최대 길이)
+    input_max_length: Optional[int] = Form(None),
     temperature: Optional[float] = Form(None),
     top_p: Optional[float] = Form(None),
     do_sample: Optional[bool] = Form(None)
     try:
         try:
             from lily_llm_core.hybrid_rag_processor import hybrid_rag_processor
+            # 신/구 파라미터 정리: 신 > 구 우선 적용
+            effective_max_new = max_new_tokens if max_new_tokens is not None else max_length
             result = hybrid_rag_processor.generate_hybrid_response(
                 query, user_id, document_id,
                 use_text, use_image, use_latex, use_latex_ocr,
+                effective_max_new, temperature, top_p, do_sample,
+                input_max_length=input_max_length
             )
         except ImportError:
             result = {

lily_llm_api/api/routers/user_memory_router.py CHANGED Viewed

@@ -7,6 +7,64 @@ import time
 logger = logging.getLogger(__name__)
 router = APIRouter()
 # ============================================================================
 # 사용자 메모리 설정 관리 API

 logger = logging.getLogger(__name__)
 router = APIRouter()
+# ---------------------------------------------------------------------------
+# 1) 사용자 장기 메모 쓰기 경로 (프로필/노트 업데이트)
+# ---------------------------------------------------------------------------
+@router.post("/user/memory/notes/{user_id}")
+async def upsert_user_long_memory(
+    user_id: str,
+    notes: str = Form("")
+):
+    """사용자 장기 메모(노트) 업데이트 (DB: memory_store, 파일: user_memory_manager 동시 업데이트)"""
+    try:
+        updated = False
+        try:
+            from lily_llm_core.memory_store import memory_store
+            memory_store.upsert_user_memory(user_id, notes)
+            updated = True
+        except Exception as _e:
+            pass
+        try:
+            from lily_llm_core.user_memory_manager import user_memory_manager
+            user_memory_manager.update_user_memory(user_id, {"important_info": [], "preferences": {}, "name": None})
+            # 노트를 별도 키에 반영하고 싶다면 preferences 등에 병합 가능
+        except Exception as _e:
+            pass
+        if updated:
+            return {"status": "success", "message": "사용자 장기 메모 업데이트 완료"}
+        return {"status": "error", "message": "업데이트에 실패했습니다"}
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+# ---------------------------------------------------------------------------
+# 2) Room 요약 향상: 키토픽 추출 및 압축 요약 저장
+# ---------------------------------------------------------------------------
+@router.post("/room/memory/summary/{room_id}")
+async def upsert_room_summary(
+    room_id: str,
+    summary: str = Form(""),
+    key_topics: str = Form("")
+):
+    """방 요약/키토픽 저장 (DB)"""
+    try:
+        from lily_llm_core.memory_store import memory_store
+        import time
+        # summary가 비어있으면 현재 세션 컨텍스트 요약을 사용 가능
+        if not summary:
+            try:
+                from lily_llm_core.context_manager import context_manager
+                session_id = f"room_{room_id}_group"  # 기본 세션 키 가정
+                summary = context_manager.get_summary_context(session_id)
+                topics = context_manager.get_key_topics(session_id)
+                if topics and not key_topics:
+                    key_topics = ",".join(topics)
+            except Exception:
+                pass
+        memory_store.upsert_room_memory(room_id, summary, key_topics, time.time())
+        return {"status": "success", "message": "Room 요약 저장 완료"}
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
 # ============================================================================
 # 사용자 메모리 설정 관리 API

lily_llm_api/app.py CHANGED Viewed

@@ -231,12 +231,17 @@ async def lifespan(app: FastAPI):
             context_manager.set_summary_method("smart")
             logger.info("✅ 고급 컨텍스트 관리자 설정 완료: smart 요약 방법 활성화")
-            # 자동 정리 설정 최적화
             context_manager.set_auto_cleanup_config(
-                enabled=True,
-                interval_turns=5,      # 5턴마다 정리
-                interval_time=180,     # 3분마다 정리
-                strategy="aggressive"   # 적극적 정리로 메모리 최적화
             )
             logger.info("✅ 자동 정리 설정 최적화 완료")

             context_manager.set_summary_method("smart")
             logger.info("✅ 고급 컨텍스트 관리자 설정 완료: smart 요약 방법 활성화")
+            # 자동 정리 설정 (환경변수로 오버라이드)
+            import os
+            enabled = os.getenv('LILY_CONTEXT_AUTOCLEAN_ENABLED', '1') in ['1', 'true', 'True']
+            interval_turns = int(os.getenv('LILY_CONTEXT_AUTOCLEAN_TURNS', '12'))
+            interval_time = int(os.getenv('LILY_CONTEXT_AUTOCLEAN_TIME', '600'))
+            strategy = os.getenv('LILY_CONTEXT_CLEANUP_STRATEGY', 'smart')
             context_manager.set_auto_cleanup_config(
+                enabled=enabled,
+                interval_turns=interval_turns,
+                interval_time=interval_time,
+                strategy=strategy
             )
             logger.info("✅ 자동 정리 설정 최적화 완료")

lily_llm_api/core/app_factory.py CHANGED Viewed

@@ -48,12 +48,17 @@ async def create_lifespan_handler(app):
             context_manager.set_summary_method("smart")
             logger.info("✅ 고급 컨텍스트 관리자 설정 완료: smart 요약 방법 활성화")
-            # 자동 정리 설정 최적화
             context_manager.set_auto_cleanup_config(
-                enabled=True,
-                interval_turns=5,      # 5턴마다 정리
-                interval_time=180,     # 3분마다 정리
-                strategy="aggressive"   # 적극적 정리로 메모리 최적화
             )
             logger.info("✅ 자동 정리 설정 최적화 완료")

             context_manager.set_summary_method("smart")
             logger.info("✅ 고급 컨텍스트 관리자 설정 완료: smart 요약 방법 활성화")
+            # 자동 정리 설정 (환경변수로 오버라이드)
+            import os
+            enabled = os.getenv('LILY_CONTEXT_AUTOCLEAN_ENABLED', '1') in ['1', 'true', 'True']
+            interval_turns = int(os.getenv('LILY_CONTEXT_AUTOCLEAN_TURNS', '12'))
+            interval_time = int(os.getenv('LILY_CONTEXT_AUTOCLEAN_TIME', '600'))
+            strategy = os.getenv('LILY_CONTEXT_CLEANUP_STRATEGY', 'smart')
             context_manager.set_auto_cleanup_config(
+                enabled=enabled,
+                interval_turns=interval_turns,
+                interval_time=interval_time,
+                strategy=strategy
             )
             logger.info("✅ 자동 정리 설정 최적화 완료")

lily_llm_api/lily_llm_api (2).zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b6fb58fe03727aade71d05ee53c4f9a20d7dbba7edf4109467b43f05d577304
+size 1141004

lily_llm_api/lily_llm_api (3).zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b6fb58fe03727aade71d05ee53c4f9a20d7dbba7edf4109467b43f05d577304
+size 1141004

lily_llm_api/lily_llm_api (4).zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ae06631193791cd4e6ca17c13043776014d7a3407ca0aa326defd17f2e819c1
+size 1144984

lily_llm_api/lily_llm_api (5).zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:36963d2dcdf0aa657731f7d2824add788c1f1eae5ba057d89cdd8e8e95c2e2c8
+size 1154904

lily_llm_api/lily_llm_api (6).zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:062f623c164562954d2f8ac9631416eed4b7d2b8e31a916a4dfe6bd234d1ec94
+size 1153684

lily_llm_api/lily_llm_api.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f9585cfd3668c8377d7b1a1971084b479212eabc1e1657b6bd8aa4513247a756
+size 1136689

lily_llm_api/models/kanana_1_5_v_3b_instruct.py CHANGED Viewed

@@ -134,12 +134,30 @@ class Kanana15V3bInstructProfile:
             )
             device = 'cuda' if torch.cuda.is_available() else 'cpu'
-            # 공식 설정 파일 bfloat16 사용, float32 사용시 메모리 에러 발생
             if device == 'cuda':
                 selected_dtype = torch.bfloat16
             else:
-                selected_dtype = torch.bfloat16
             logger.info(f"🔧 선택된 dtype: {selected_dtype} (device: {device})")
@@ -167,8 +185,16 @@ class Kanana15V3bInstructProfile:
     def get_generation_config(self) -> Dict[str, Any]:
         return {
-            "max_new_tokens": 128,
             "do_sample": True,
             "temperature": 0.7,
             "top_k": 50,
@@ -178,13 +204,6 @@ class Kanana15V3bInstructProfile:
             "pad_token_id": 128001,
             "eos_token_id": 128009,
             "bos_token_id": 128000,
-            # "use_cache": False,
-            # "early_stopping": False,
-            # "num_beams": 1,
-            # "num_return_sequences": 1,
-            # "return_full_text": False,
-            # "return_dict": False,
-            # "return_dict_in_generate": False,
         }
     def extract_response(self, full_text: str, formatted_prompt: str = None, **kwargs) -> str:

             )
             device = 'cuda' if torch.cuda.is_available() else 'cpu'
+            # 환경변수로 dtype 제어 (기본: CPU=float32, CUDA=bfloat16)
+            env_dtype = (os.getenv('LILY_FORCE_DTYPE') or os.getenv('LILY_CPU_DTYPE') if device=='cpu' else os.getenv('LILY_CUDA_DTYPE'))
             if device == 'cuda':
+                env_dtype = os.getenv('LILY_FORCE_DTYPE') or os.getenv('LILY_CUDA_DTYPE')
                 selected_dtype = torch.bfloat16
+                if env_dtype:
+                    m = env_dtype.lower()
+                    if m in ('float16','fp16'):
+                        selected_dtype = torch.float16
+                    elif m in ('float32','fp32'):
+                        selected_dtype = torch.float32
+                    elif m in ('bfloat16','bf16'):
+                        selected_dtype = torch.bfloat16
             else:
+                # CPU 기본은 float32 (속도/호환성)
+                selected_dtype = torch.float32
+                if env_dtype:
+                    m = env_dtype.lower()
+                    if m in ('float16','fp16'):
+                        selected_dtype = torch.float16
+                    elif m in ('float32','fp32'):
+                        selected_dtype = torch.float32
+                    elif m in ('bfloat16','bf16'):
+                        selected_dtype = torch.bfloat16
             logger.info(f"🔧 선택된 dtype: {selected_dtype} (device: {device})")
     def get_generation_config(self) -> Dict[str, Any]:
+        import os
+        def _get_int(env_key: str, default_val: int) -> int:
+            try:
+                v = int(os.getenv(env_key, str(default_val)))
+                return v if v > 0 else default_val
+            except Exception:
+                return default_val
+        max_new = _get_int('LILY_MAX_NEW_TOKENS', 128)
         return {
+            "max_new_tokens": max_new,
             "do_sample": True,
             "temperature": 0.7,
             "top_k": 50,
             "pad_token_id": 128001,
             "eos_token_id": 128009,
             "bos_token_id": 128000,
         }
     def extract_response(self, full_text: str, formatted_prompt: str = None, **kwargs) -> str:

lily_llm_api/services/generation_service.py CHANGED Viewed

@@ -34,7 +34,8 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
                  do_sample: Optional[bool] = None, use_context: bool = True, session_id: str = None,
                  user_id: str = "anonymous", room_id: str = "default", use_rag_images: bool = False,
                  use_rag_text: bool = False, document_id: Optional[str] = None,
-                 image_short_side: Optional[int] = None) -> dict:
     """[최적화] 모델 생성을 처리하는 통합 동기 함수"""
     try:
         from .model_service import get_current_profile, get_current_model
@@ -78,11 +79,40 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
         print(f"🔍 [DEBUG] 모델 이름: {getattr(current_profile, 'model_name', 'Unknown')}")
         print(f"🔍 [DEBUG] 멀티모달 지원: {getattr(current_profile, 'multimodal', False)}")
         print(f"🔍 [DEBUG] 입력 프롬프트: {prompt}")
         print(f"🔍 [DEBUG] 입력 프롬프트 길이: {len(prompt)}")
         print(f"🔍 [DEBUG] 이미지 데이터 존재 여부: {image_data_list is not None}")
         print(f"🔍 [DEBUG] 이미지 데이터 개수: {len(image_data_list) if image_data_list else 0}")
         print(f"🔍 [DEBUG] 실제 이미지 데이터 개수: {len([img for img in image_data_list if img]) if image_data_list else 0}")
         image_processed = False
         all_pixel_values = []
@@ -94,11 +124,14 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
             all_image_data.extend(image_data_list)
             print(f"🔍 [DEBUG] 직접 전달된 이미지 {len(image_data_list)}개 추가")
         else:
-            # 현재 요청에 이미지가 없으면 세션 캐시에서 복구 시도
-            if session_id and session_id in _session_image_cache and len(_session_image_cache[session_id]) > 0:
-                cached_imgs = _session_image_cache[session_id]
-                all_image_data.extend(cached_imgs)
-                print(f"🔍 [DEBUG] 세션 캐시에서 이전 이미지 {len(cached_imgs)}개 복구 (세션: {session_id})")
         # 추가 복구: 여전히 이미지가 없고 멀티모달이며, 명시적으로 허용된 경우에만 RAG에서 이미지 복원
         if use_rag_images and (not all_image_data or len([img for img in all_image_data if img]) == 0) and getattr(current_profile, 'multimodal', False):
@@ -138,7 +171,8 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
             except Exception as e:
                 print(f"⚠️ [DEBUG] RAG 기반 이미지 복구 실패: {e}")
         elif not use_rag_images and getattr(current_profile, 'multimodal', False):
-            print("🔍 [DEBUG] RAG 이미지 복구 비활성화됨(use_rag_images=False) - 텍스트 전용 유지")
         # 항상 참조 가능한 max_images 정의 (이미지 없으면 0)
         # 1차 상한은 4장으로 제한 (최종 선택은 예산 기반 동적 선택에서 결정)
@@ -234,6 +268,8 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
             # 🔧 이미지 토큰 예산 기반 동적 선택 (멀티모달 길이 초과 방지)
             try:
                 # 1) 이미지별 토큰 수 산출
                 per_image_tokens: List[int] = []
                 if isinstance(combined_image_metas, dict) and 'image_token_thw' in combined_image_metas:
@@ -247,7 +283,8 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
                     per_image_tokens = [3000] * len(all_pixel_values)
                 # 2) 텍스트 길이 측정 (이미지 토큰 제외한 프롬프트)
-                base_text_prompt = f"Human: {prompt}\nAssistant:"
                 text_inputs = tokenizer(
                     base_text_prompt,
                     return_tensors="pt",
@@ -375,8 +412,9 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
             num_images = len(all_pixel_values)
             image_tokens = "<image>" * num_images  # 이미지 개수만큼 <image> 토큰 생성
             # 답변 유도를 위해 Assistant 프리픽스 추가
-            # 길이 초과를 방지하기 위해 멀티모달 경로에서는 사용자 입력만 포함
-            formatted_prompt = f"Human: {image_tokens}{prompt}\nAssistant:"
             print(f"🔍 [DEBUG] 멀티모달 프롬프트 구성 (공식 형식): {formatted_prompt}")
             print(f"🔍 [DEBUG] 이미지 토큰 생성: {num_images}개 이미지 -> {image_tokens}")
             image_processed = True
@@ -412,7 +450,8 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
                 print(f"🔍 [DEBUG] 기본 프롬프트 사용 (컨텍스트 포함): {formatted_prompt}")
         print(f"🔍 [DEBUG] 프롬프트 구성 완료 - 길이: {len(formatted_prompt) if formatted_prompt else 0}")
-        print(f"🔍 [DEBUG] 최종 프롬프트: {formatted_prompt}")
         # --- 3. 토크나이징 ---
         print(f"🔍 [DEBUG] 토크나이징 시작")
@@ -423,12 +462,18 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
             print(f"🔍 [DEBUG] 텍스트-only 토크나이징 경로")
             print(f"🔍 [DEBUG] 사용할 프롬프트: {formatted_prompt}")
             inputs = tokenizer(
                 formatted_prompt,
                 return_tensors="pt",
                 padding=True,
                 truncation=True,
-                max_length=2048,
             )
             if 'token_type_ids' in inputs:
                 del inputs['token_type_ids']
@@ -514,7 +559,7 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
                     # 🔄 공식 방식: max_length 파라미터 추가
                     inputs = tokenizer.encode_prompt(
                         prompt=formatted_prompt,
-                        max_length=2048,  # 공식 코드와 동일
                         image_meta=final_meta
                     )
                     print(f"🔍 [DEBUG] encode_prompt 출력: {list(inputs.keys())}")
@@ -560,7 +605,7 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
                                 print(f"🔁 [DEBUG] 재시도 limit={limit}: {base_prompt_retry}")
                                 inputs_retry = tokenizer.encode_prompt(
                                     prompt=base_prompt_retry,
-                                    max_length=2048,
                                     image_meta=final_meta
                                 )
                                 # 정규화
@@ -585,7 +630,7 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
                         print(f"🔁 [DEBUG] encode_prompt 재시도(컨텍스트 제거): {base_prompt_retry}")
                         inputs = tokenizer.encode_prompt(
                             prompt=base_prompt_retry,
-                            max_length=2048,
                             image_meta=final_meta
                         )
                         print(f"🔍 [DEBUG] encode_prompt 재시도 성공: {list(inputs.keys())}")
@@ -599,7 +644,7 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
                             return_tensors="pt",
                             padding=True,
                             truncation=True,
-                            max_length=2048,
                         )
                         if 'token_type_ids' in inputs:
                             del inputs['token_type_ids']
@@ -613,7 +658,7 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
                     return_tensors="pt",
                     padding=True,
                     truncation=True,
-                    max_length=2048,
                 )
                 if 'token_type_ids' in inputs:
                     del inputs['token_type_ids']
@@ -675,6 +720,20 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
         if do_sample is not None:
             gen_config['do_sample'] = do_sample
         print(f"🔍 [DEBUG] 생성 설정: {gen_config}")
         # --- 5. 실제 추론 실행 ---
@@ -948,31 +1007,39 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
                                     'attention_mask': attention_mask
                                 }
-                                generated_ids = lora_model.generate(
-                                    **lora_inputs,
-                                    **gen_config
-                                )
                             else:
                                 print(f"⚠️ [DEBUG] LoRA 모델을 가져올 수 없음, 기본 모델 사용")
                                 generated_ids = current_model.generate(
                                     input_ids=input_ids,
                                     attention_mask=attention_mask,
                                     **gen_config
                                 )
-                        else:
-                            print(f"🔍 [DEBUG] LoRA 어댑터 없음, 기본 모델 사용")
                             generated_ids = current_model.generate(
                                 input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 **gen_config
                             )
-                    except ImportError:
-                        print(f"🔍 [DEBUG] LoRA 지원 안됨, 기본 모델 사용")
-                        generated_ids = current_model.generate(
-                            input_ids=input_ids,
-                            attention_mask=attention_mask,
-                            **gen_config
-                        )
                     print(f"🔍 [DEBUG] 모델 생성 완료 시간: {time.time()}")
@@ -998,9 +1065,10 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
         try:
             # 생성된 텍스트 디코딩
             full_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
-            print(f"🔍 [DEBUG] 전체 텍스트 길이: {len(full_text)}")
-            print(f"🔍 [DEBUG] 전체 생성 텍스트 (Raw): \n---\n{full_text}\n---")
-            print(f"🔍 [DEBUG] 사용된 프롬프트: {formatted_prompt}")
             # 프로필별 응답 추출 (안전한 방식)
             if hasattr(current_profile, 'extract_response'):

                  do_sample: Optional[bool] = None, use_context: bool = True, session_id: str = None,
                  user_id: str = "anonymous", room_id: str = "default", use_rag_images: bool = False,
                  use_rag_text: bool = False, document_id: Optional[str] = None,
+                 image_short_side: Optional[int] = None,
+                 input_max_length: Optional[int] = None) -> dict:
     """[최적화] 모델 생성을 처리하는 통합 동기 함수"""
     try:
         from .model_service import get_current_profile, get_current_model
         print(f"🔍 [DEBUG] 모델 이름: {getattr(current_profile, 'model_name', 'Unknown')}")
         print(f"🔍 [DEBUG] 멀티모달 지원: {getattr(current_profile, 'multimodal', False)}")
+        # 속도/로그 모드 플래그 (환경변수)
+        speed_mode = os.getenv('LILY_SPEED_MODE', '0') == '1'
+        debug_log_prompt = os.getenv('LILY_DEBUG_LOG_PROMPT', '0') == '1'
+        debug_log_text = os.getenv('LILY_DEBUG_LOG_TEXT', '0') == '1'
+        # CPU에서 bf16은 느릴 수 있으므로 필요 시 float32로 전환
+        try:
+            import torch as _torch
+            if hasattr(current_model, 'device') and str(current_model.device) == 'cpu':
+                desired = (os.getenv('LILY_FORCE_DTYPE') or os.getenv('LILY_CPU_DTYPE') or 'float32').lower()
+                desired_map = {
+                    'float32': _torch.float32,
+                    'fp32': _torch.float32,
+                    'bfloat16': _torch.bfloat16,
+                    'bf16': _torch.bfloat16,
+                    'float16': _torch.float16,
+                    'fp16': _torch.float16,
+                }
+                target_dtype = desired_map.get(desired, _torch.float32)
+                if hasattr(current_model, 'dtype') and current_model.dtype != target_dtype:
+                    print(f"🔧 [SPEED] CPU dtype 전환: {current_model.dtype} -> {target_dtype}")
+                    current_model = current_model.to(target_dtype)
+        except Exception as _dtype_e:
+            print(f"⚠️ [SPEED] dtype 전환 실패: {_dtype_e}")
+        # 항상 eval 모드
+        try:
+            current_model.eval()
+        except Exception:
+            pass
         print(f"🔍 [DEBUG] 입력 프롬프트: {prompt}")
         print(f"🔍 [DEBUG] 입력 프롬프트 길이: {len(prompt)}")
         print(f"🔍 [DEBUG] 이미지 데이터 존재 여부: {image_data_list is not None}")
         print(f"🔍 [DEBUG] 이미지 데이터 개수: {len(image_data_list) if image_data_list else 0}")
         print(f"🔍 [DEBUG] 실제 이미지 데이터 개수: {len([img for img in image_data_list if img]) if image_data_list else 0}")
+        print(f"🔍 [DEBUG] use_rag_images 플래그: {use_rag_images}")
         image_processed = False
         all_pixel_values = []
             all_image_data.extend(image_data_list)
             print(f"🔍 [DEBUG] 직접 전달된 이미지 {len(image_data_list)}개 추가")
         else:
+            # 현재 요청에 이미지가 없으면 (옵션) 세션 캐시에서 복구
+            if use_rag_images:
+                if session_id and session_id in _session_image_cache and len(_session_image_cache[session_id]) > 0:
+                    cached_imgs = _session_image_cache[session_id]
+                    all_image_data.extend(cached_imgs)
+                    print(f"🔍 [DEBUG] 세션 캐시에서 이전 이미지 {len(cached_imgs)}개 복구 (세션: {session_id})")
+            else:
+                print("🔍 [DEBUG] 세션 캐시 복구 비활성화(use_rag_images=False)")
         # 추가 복구: 여전히 이미지가 없고 멀티모달이며, 명시적으로 허용된 경우에만 RAG에서 이미지 복원
         if use_rag_images and (not all_image_data or len([img for img in all_image_data if img]) == 0) and getattr(current_profile, 'multimodal', False):
             except Exception as e:
                 print(f"⚠️ [DEBUG] RAG 기반 이미지 복구 실패: {e}")
         elif not use_rag_images and getattr(current_profile, 'multimodal', False):
+            # RAG 기반의 추가 이미지 복구만 비활성화. 직접 첨부 이미지는 유지한다.
+            print("🔍 [DEBUG] RAG 이미지 복구 비활성화됨(use_rag_images=False) - 직접 첨부 이미지만 허용")
         # 항상 참조 가능한 max_images 정의 (이미지 없으면 0)
         # 1차 상한은 4장으로 제한 (최종 선택은 예산 기반 동적 선택에서 결정)
             # 🔧 이미지 토큰 예산 기반 동적 선택 (멀티모달 길이 초과 방지)
             try:
+                # 예산 계산에서 사용할 RAG 스니펫 기본값 (후단에서 실제 계산됨)
+                rag_snippet_short = ""
                 # 1) 이미지별 토큰 수 산출
                 per_image_tokens: List[int] = []
                 if isinstance(combined_image_metas, dict) and 'image_token_thw' in combined_image_metas:
                     per_image_tokens = [3000] * len(all_pixel_values)
                 # 2) 텍스트 길이 측정 (이미지 토큰 제외한 프롬프트)
+                #    멀티모달에서도 RAG 스니펫(축약)을 포함하여 텍스트 길이를 산정
+                base_text_prompt = f"Human: {rag_snippet_short}{prompt}\nAssistant:"
                 text_inputs = tokenizer(
                     base_text_prompt,
                     return_tensors="pt",
             num_images = len(all_pixel_values)
             image_tokens = "<image>" * num_images  # 이미지 개수만큼 <image> 토큰 생성
             # 답변 유도를 위해 Assistant 프리픽스 추가
+            # 멀티모달에서도 RAG 텍스트 스니펫(축약)을 앞에 포함하여 텍스트 근거를 반영
+            mm_text = f"{rag_snippet_short}{prompt}" if rag_snippet_short else prompt
+            formatted_prompt = f"Human: {image_tokens}{mm_text}\nAssistant:"
             print(f"🔍 [DEBUG] 멀티모달 프롬프트 구성 (공식 형식): {formatted_prompt}")
             print(f"🔍 [DEBUG] 이미지 토큰 생성: {num_images}개 이미지 -> {image_tokens}")
             image_processed = True
                 print(f"🔍 [DEBUG] 기본 프롬프트 사용 (컨텍스트 포함): {formatted_prompt}")
         print(f"🔍 [DEBUG] 프롬프트 구성 완료 - 길이: {len(formatted_prompt) if formatted_prompt else 0}")
+        if debug_log_prompt:
+            print(f"🔍 [DEBUG] 최종 프롬프트: {formatted_prompt}")
         # --- 3. 토크나이징 ---
         print(f"🔍 [DEBUG] 토크나이징 시작")
             print(f"🔍 [DEBUG] 텍스트-only 토크나이징 경로")
             print(f"🔍 [DEBUG] 사용할 프롬프트: {formatted_prompt}")
+            # 입력 최대 길이: 사용자 입력 > 환경변수 > 기본값
+            try:
+                env_input_max_len = int(os.getenv('LILY_INPUT_MAX_LENGTH', '0'))
+            except Exception:
+                env_input_max_len = 0
+            effective_input_max_len = input_max_length or (env_input_max_len if env_input_max_len > 0 else (1024 if speed_mode else 2048))
             inputs = tokenizer(
                 formatted_prompt,
                 return_tensors="pt",
                 padding=True,
                 truncation=True,
+                max_length=effective_input_max_len,
             )
             if 'token_type_ids' in inputs:
                 del inputs['token_type_ids']
                     # 🔄 공식 방식: max_length 파라미터 추가
                     inputs = tokenizer.encode_prompt(
                         prompt=formatted_prompt,
+                        max_length=effective_input_max_len,
                         image_meta=final_meta
                     )
                     print(f"🔍 [DEBUG] encode_prompt 출력: {list(inputs.keys())}")
                                 print(f"🔁 [DEBUG] 재시도 limit={limit}: {base_prompt_retry}")
                                 inputs_retry = tokenizer.encode_prompt(
                                     prompt=base_prompt_retry,
+                                    max_length=effective_input_max_len,
                                     image_meta=final_meta
                                 )
                                 # 정규화
                         print(f"🔁 [DEBUG] encode_prompt 재시도(컨텍스트 제거): {base_prompt_retry}")
                         inputs = tokenizer.encode_prompt(
                             prompt=base_prompt_retry,
+                            max_length=effective_input_max_len,
                             image_meta=final_meta
                         )
                         print(f"🔍 [DEBUG] encode_prompt 재시도 성공: {list(inputs.keys())}")
                             return_tensors="pt",
                             padding=True,
                             truncation=True,
+                            max_length=effective_input_max_len,
                         )
                         if 'token_type_ids' in inputs:
                             del inputs['token_type_ids']
                     return_tensors="pt",
                     padding=True,
                     truncation=True,
+                    max_length=effective_input_max_len,
                 )
                 if 'token_type_ids' in inputs:
                     del inputs['token_type_ids']
         if do_sample is not None:
             gen_config['do_sample'] = do_sample
+        # 속도 우선 모드일 때 생성 설정 다이어트
+        if speed_mode:
+            try:
+                gen_config['do_sample'] = False
+                gen_config['top_k'] = None
+                gen_config['top_p'] = 1.0
+                gen_config['repetition_penalty'] = 1.0
+                gen_config['no_repeat_ngram_size'] = 0
+                if 'max_new_tokens' in gen_config and gen_config['max_new_tokens'] is not None:
+                    gen_config['max_new_tokens'] = min(int(gen_config['max_new_tokens']), 64)
+                else:
+                    gen_config['max_new_tokens'] = 64
+            except Exception as _e_cfg:
+                print(f"⚠️ [SPEED] 생성 설정 다이어트 실패: {_e_cfg}")
         print(f"🔍 [DEBUG] 생성 설정: {gen_config}")
         # --- 5. 실제 추론 실행 ---
                                     'attention_mask': attention_mask
                                 }
+                                import torch as _torch
+                                with _torch.inference_mode():
+                                    generated_ids = lora_model.generate(
+                                        **lora_inputs,
+                                        **gen_config
+                                    )
                             else:
                                 print(f"⚠️ [DEBUG] LoRA 모델을 가져올 수 없음, 기본 모델 사용")
+                                import torch as _torch
+                                with _torch.inference_mode():
+                                    generated_ids = current_model.generate(
+                                        input_ids=input_ids,
+                                        attention_mask=attention_mask,
+                                        **gen_config
+                                    )
+                        else:
+                            print(f"🔍 [DEBUG] LoRA 어댑터 없음, 기본 모델 사용")
+                            import torch as _torch
+                            with _torch.inference_mode():
                                 generated_ids = current_model.generate(
                                     input_ids=input_ids,
                                     attention_mask=attention_mask,
                                     **gen_config
                                 )
+                    except ImportError:
+                        print(f"🔍 [DEBUG] LoRA 지원 안됨, 기본 모델 사용")
+                        import torch as _torch
+                        with _torch.inference_mode():
                             generated_ids = current_model.generate(
                                 input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 **gen_config
                             )
                     print(f"🔍 [DEBUG] 모델 생성 완료 시간: {time.time()}")
         try:
             # 생성된 텍스트 디코딩
             full_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+            if os.getenv('LILY_DEBUG_LOG_TEXT', '0') == '1':
+                print(f"🔍 [DEBUG] 전체 텍스트 길이: {len(full_text)}")
+                print(f"🔍 [DEBUG] 전체 생성 텍스트 (Raw): \n---\n{full_text}\n---")
+                print(f"🔍 [DEBUG] 사용된 프롬프트: {formatted_prompt}")
             # 프로필별 응답 추출 (안전한 방식)
             if hasattr(current_profile, 'extract_response'):

lily_llm_api/services/model_service.py CHANGED Viewed

@@ -2,6 +2,7 @@
 Model service for Lily LLM API
 """
 import logging
 import asyncio
 import concurrent.futures
 from typing import Optional
@@ -55,7 +56,32 @@ def load_model_sync(model_id: str):
         # 이제 load_model은 (model, processor)를 반환합니다.
         model, processor = current_profile.load_model()
         # 🔄 전역 변수에 모델 설정 (LoRA에서 사용)
         current_model = model

 Model service for Lily LLM API
 """
 import logging
+import os
 import asyncio
 import concurrent.futures
 from typing import Optional
         # 이제 load_model은 (model, processor)를 반환합니다.
         model, processor = current_profile.load_model()
+        # 🔧 서버 시작 시점에서 dtype 강제 적용 (첫 요청 지연 방지)
+        try:
+            import torch as _torch
+            # 디바이스별 대상 dtype 결정 (기본: CPU=float32, CUDA=bfloat16)
+            if hasattr(model, 'device') and str(model.device) == 'cpu':
+                desired = (os.getenv('LILY_FORCE_DTYPE') or os.getenv('LILY_CPU_DTYPE') or 'float32').lower()
+                default_target = _torch.float32
+            else:
+                desired = (os.getenv('LILY_FORCE_DTYPE') or os.getenv('LILY_CUDA_DTYPE') or 'bfloat16').lower()
+                default_target = _torch.bfloat16
+            desired_map = {
+                'float32': _torch.float32,
+                'fp32': _torch.float32,
+                'bfloat16': _torch.bfloat16,
+                'bf16': _torch.bfloat16,
+                'float16': _torch.float16,
+                'fp16': _torch.float16,
+            }
+            target_dtype = desired_map.get(desired, default_target)
+            if hasattr(model, 'dtype') and model.dtype != target_dtype:
+                logger.info(f"🔧 [SPEED][startup] dtype 적용: {model.dtype} -> {target_dtype}")
+                model = model.to(target_dtype)
+        except Exception as _dtype_e:
+            logger.warning(f"⚠️ [startup] dtype 적용 실패: {_dtype_e}")
         # 🔄 전역 변수에 모델 설정 (LoRA에서 사용)
         current_model = model

lily_llm_api/services/session_registry.py CHANGED Viewed

@@ -3,13 +3,15 @@
 - 프로세스 메모리 기반 (서비스 재시작 시 초기화)
 - 업로드/생성 간 user_id 불일치 보정용
 """
-from typing import Optional, Dict
 import time
 _room_to_user: Dict[str, str] = {}
 _session_to_user: Dict[str, str] = {}
 _last_user: Optional[str] = None
 _last_updated_at: float = 0.0
 def set_user_for_room(room_id: Optional[str], user_id: Optional[str]) -> None:
     if not room_id or not user_id:
@@ -50,4 +52,67 @@ def clear() -> None:
     _last_user = None
     _last_updated_at = 0.0

 - 프로세스 메모리 기반 (서비스 재시작 시 초기화)
 - 업로드/생성 간 user_id 불일치 보정용
 """
+from typing import Optional, Dict, Any
 import time
+import os
 _room_to_user: Dict[str, str] = {}
 _session_to_user: Dict[str, str] = {}
 _last_user: Optional[str] = None
 _last_updated_at: float = 0.0
+_room_flags: Dict[str, Dict[str, Dict[str, Any]]] = {}
 def set_user_for_room(room_id: Optional[str], user_id: Optional[str]) -> None:
     if not room_id or not user_id:
     _last_user = None
     _last_updated_at = 0.0
+# ---- room-scoped one-shot flags ----
+def set_flag_for_room(room_id: Optional[str], key: str, value: bool = True, ttl_seconds: Optional[float] = None) -> None:
+    """Store a boolean flag for a room, optionally with an expiry time.
+
+    Does nothing when ``room_id`` or ``key`` is falsy. Both are converted
+    with ``str()`` before being used as dictionary keys.
+
+    Args:
+        room_id: Room the flag belongs to.
+        key: Name of the flag inside the room.
+        value: Flag value; stored as ``bool(value)``.
+        ttl_seconds: Lifetime in seconds. When ``None`` it is read from the
+            ``LILY_RAG_IMAGE_FLAG_TTL_SECONDS`` environment variable and
+            falls back to 120 seconds if that is unset or unparsable.
+            A falsy or non-positive TTL stores ``expires_at=None``.
+    """
+    if not (room_id and key):
+        return
+    room_key = str(room_id)
+    room_entries = _room_flags.get(room_key)
+    if room_entries is None:
+        # First flag for this room: create and register its flag table.
+        room_entries = _room_flags[room_key] = {}
+    # TTL precedence: explicit argument > environment variable > 120 seconds.
+    if ttl_seconds is None:
+        try:
+            raw_ttl = os.getenv('LILY_RAG_IMAGE_FLAG_TTL_SECONDS')
+            ttl_seconds = 120.0 if raw_ttl is None else float(raw_ttl)
+        except Exception:
+            ttl_seconds = 120.0
+    if ttl_seconds and ttl_seconds > 0:
+        deadline = time.time() + float(ttl_seconds)
+    else:
+        # No positive TTL: the entry carries no expiry timestamp.
+        deadline = None
+    room_entries[str(key)] = {"value": bool(value), "expires_at": deadline}
+def get_flag_for_room(room_id: Optional[str], key: str) -> Optional[bool]:
+    """Return a room flag's value without consuming it.
+
+    Args:
+        room_id: Room to look in; converted with ``str()``.
+        key: Flag name; converted with ``str()``.
+
+    Returns:
+        ``bool`` of the stored value, or ``None`` when ``room_id``/``key`` is
+        falsy, the flag is not set, or the flag has expired. An expired
+        flag is removed from the room as a side effect.
+    """
+    if not room_id or not key:
+        return None
+    name = str(key)
+    flags = _room_flags.get(str(room_id)) or {}
+    entry = flags.get(name)
+    if not entry:
+        return None
+    expires_at = entry.get("expires_at")
+    if expires_at is not None and time.time() > expires_at:
+        # Expired: drop the stale entry. pop() with a default never raises
+        # for a missing key, so no exception guard is needed here.
+        flags.pop(name, None)
+        return None
+    return bool(entry.get("value"))
+def pop_flag_for_room(room_id: Optional[str], key: str) -> Optional[bool]:
+    """Consume a one-shot room flag: return its value and remove it.
+
+    Args:
+        room_id: Room to look in; converted with ``str()``.
+        key: Flag name; converted with ``str()``.
+
+    Returns:
+        ``bool`` of the stored value when the flag exists and has not
+        expired; ``None`` when ``room_id``/``key`` is falsy, the flag is not
+        set, or it has expired. The entry is removed in both the live and
+        the expired case.
+    """
+    if not room_id or not key:
+        return None
+    flags = _room_flags.get(str(room_id))
+    if not flags:
+        return None
+    name = str(key)
+    entry = flags.get(name)
+    if not entry:
+        return None
+    # Live and expired entries are both removed, so remove once up front.
+    # pop() with a default tolerates the key already being gone.
+    flags.pop(name, None)
+    expires_at = entry.get("expires_at")
+    if expires_at is not None and time.time() > expires_at:
+        # Expired: discarded without being reported as consumed.
+        return None
+    return bool(entry.get("value"))

lily_llm_core/lily_llm_core (2).zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f9ad7c0597cf61ee2dc67f5891da819b1809640835bf11123d8b4a144d48b8fe
+size 1934920

lily_llm_core/lily_llm_core (3).zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fc71cf6d83b343051d3a96996fb4444ba213219c87d20c49abe82e077234adf5
+size 467736

lily_llm_core/lily_llm_core (4).zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3d10427cd12ff84032904e80ff66edfbd3ea1f4541fbcc091546eea12cd5722
+size 995879

lily_llm_core/lily_llm_core (5).zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:46564b960e68c833cd25844a9dd1292c1b42bff30e4926be09e153794fae5fc9
+size 997411

lily_llm_core/lily_llm_core (6).zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7674a856489f61a0c78c8cdf1584c74dcbb3db537d22d8af17062404fa11a37e
+size 990717

lily_llm_core/lily_llm_core.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:47c145852d0b39b15d41e5b9eee35367c7a16d8f070465744ed2bfb3f2de9202
+size 962275

lily_llm_core/lora_manager.py CHANGED Viewed

@@ -167,17 +167,40 @@ class LoRAManager:
                     str(model_path),
                     trust_remote_code=True,
                     local_files_only=True,
-                    torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                     device_map="auto" if self.device == "cuda" else None
                 )
             elif model_type == "vision2seq":
                 # 🔄 Vision2Seq 모델 지원 추가 (kanana 등, bfloat16 사용)
                 from transformers import AutoModelForVision2Seq
                 self.base_model = AutoModelForVision2Seq.from_pretrained(
                     str(model_path),
                     trust_remote_code=True,
                     local_files_only=True,
-                    torch_dtype=torch.bfloat16 if self.device == "cuda" else torch.bfloat16,
                     device_map="auto" if self.device == "cuda" else None
                 )
             else:

                     str(model_path),
                     trust_remote_code=True,
                     local_files_only=True,
+                    torch_dtype=(torch.float16 if self.device == "cuda" else torch.float32),
                     device_map="auto" if self.device == "cuda" else None
                 )
             elif model_type == "vision2seq":
                 # 🔄 Vision2Seq 모델 지원 추가 (kanana 등, bfloat16 사용)
                 from transformers import AutoModelForVision2Seq
+                # 환경변수로 dtype 제어: CPU 기본 float32
+                env_dtype = (os.getenv('LILY_FORCE_DTYPE') or os.getenv('LILY_CPU_DTYPE') if self.device=='cpu' else os.getenv('LILY_CUDA_DTYPE'))
+                if self.device == 'cuda':
+                    env_dtype = os.getenv('LILY_FORCE_DTYPE') or os.getenv('LILY_CUDA_DTYPE')
+                    selected_dtype = torch.bfloat16
+                    if env_dtype:
+                        m = env_dtype.lower()
+                        if m in ('float16','fp16'):
+                            selected_dtype = torch.float16
+                        elif m in ('float32','fp32'):
+                            selected_dtype = torch.float32
+                        elif m in ('bfloat16','bf16'):
+                            selected_dtype = torch.bfloat16
+                else:
+                    selected_dtype = torch.float32
+                    if env_dtype:
+                        m = env_dtype.lower()
+                        if m in ('float16','fp16'):
+                            selected_dtype = torch.float16
+                        elif m in ('float32','fp32'):
+                            selected_dtype = torch.float32
+                        elif m in ('bfloat16','bf16'):
+                            selected_dtype = torch.bfloat16
                 self.base_model = AutoModelForVision2Seq.from_pretrained(
                     str(model_path),
                     trust_remote_code=True,
                     local_files_only=True,
+                    torch_dtype=selected_dtype,
                     device_map="auto" if self.device == "cuda" else None
                 )
             else:

lily_llm_core/memory_store.py ADDED Viewed

	@@ -0,0 +1,179 @@

+#!/usr/bin/env python3
+"""
+Room/User memory store with pluggable backend
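+- Preferred: SQLAlchemy, used when it is importable and the LILY_MEMORY_URL
+  environment variable holds a database URL (MySQL/Postgres/SQLite)
+- Fallback: the standard-library sqlite3 module, used when SQLAlchemy cannot
+  be imported or LILY_MEMORY_URL is not set (file path from LILY_MEMORY_DB)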
+"""
+import os
+import time
+from typing import Optional, Dict, Any
+# ---- SQLAlchemy backend (preferred) ----
+SQLA_AVAILABLE = False
+try:
+    from sqlalchemy import create_engine, Column, Text, String, Float, LargeBinary
+    from sqlalchemy.orm import declarative_base, sessionmaker
+    SQLA_AVAILABLE = True
+except Exception:
+    SQLA_AVAILABLE = False
+LILY_MEMORY_URL = os.getenv('LILY_MEMORY_URL')
+if SQLA_AVAILABLE and LILY_MEMORY_URL:
+    Base = declarative_base()
+    class RoomMemory(Base):  # ORM mapping for the 'room_memory' table: one row per room_id
+        __tablename__ = 'room_memory'
+        room_id = Column(String(255), primary_key=True)  # primary key; the store passes str(room_id)
+        summary = Column(Text)  # free-text summary; the store writes '' when none is supplied
+        key_topics = Column(Text)  # topics text; filled from the upsert's key_topics_csv argument
+        last_turn_ts = Column(Float)  # float timestamp; the store substitutes time.time() when the argument is falsy
+        updated_at = Column(Float)  # time.time() taken when the row was last upserted
+    class UserMemory(Base):  # ORM mapping for the 'user_memory' table: one row per user_id
+        __tablename__ = 'user_memory'
+        user_id = Column(String(255), primary_key=True)  # primary key; the store passes str(user_id)
+        notes = Column(Text)  # free-text notes; the store writes '' when none is supplied
+        embedding = Column(LargeBinary)  # optional raw bytes; overwritten on every upsert, with None when omitted
+        updated_at = Column(Float)  # time.time() taken when the row was last upserted
+    _engine = create_engine(LILY_MEMORY_URL, echo=os.getenv('LILY_MEMORY_ECHO', '0') in ['1','true','True'])  # echo is on only when LILY_MEMORY_ECHO is exactly '1', 'true' or 'True'
+    Base.metadata.create_all(_engine)  # executed at import time against the configured database
+    _Session = sessionmaker(bind=_engine)  # session factory used by every MemoryStore method in this branch
+    class MemoryStore:
+        def __init__(self) -> None:
+            pass
+        # ---- Room mid-term memory ----
+        def upsert_room_memory(self, room_id: str, summary: str, key_topics_csv: str, last_turn_ts: float) -> None:
+            now = time.time()
+            with _Session() as s:
+                obj = s.get(RoomMemory, str(room_id))
+                if obj is None:
+                    obj = RoomMemory(room_id=str(room_id))
+                obj.summary = summary or ''
+                obj.key_topics = key_topics_csv or ''
+                obj.last_turn_ts = float(last_turn_ts or now)
+                obj.updated_at = now
+                s.merge(obj)
+                s.commit()
+        def get_room_memory(self, room_id: str) -> Optional[Dict[str, Any]]:
+            with _Session() as s:
+                obj = s.get(RoomMemory, str(room_id))
+                if not obj:
+                    return None
+                return {"summary": obj.summary or '', "key_topics": obj.key_topics or ''}
+        # ---- User long-term memory ----
+        def upsert_user_memory(self, user_id: str, notes: str, embedding: Optional[bytes] = None) -> None:
+            now = time.time()
+            with _Session() as s:
+                obj = s.get(UserMemory, str(user_id))
+                if obj is None:
+                    obj = UserMemory(user_id=str(user_id))
+                obj.notes = notes or ''
+                obj.embedding = embedding
+                obj.updated_at = now
+                s.merge(obj)
+                s.commit()
+        def get_user_memory(self, user_id: str) -> Optional[Dict[str, Any]]:
+            with _Session() as s:
+                obj = s.get(UserMemory, str(user_id))
+                if not obj:
+                    return None
+                return {"notes": obj.notes or ''}
+    memory_store = MemoryStore()
+else:
+    # ---- SQLite fallback (sqlite3) ----
+    import sqlite3
+    DB_PATH = os.getenv('LILY_MEMORY_DB', 'data/memory/memory.db')
+    SCHEMA_SQL = """
+    CREATE TABLE IF NOT EXISTS room_memory (
+        room_id TEXT PRIMARY KEY,
+        summary TEXT,
+        key_topics TEXT,
+        last_turn_ts REAL,
+        updated_at REAL
+    );
+    CREATE TABLE IF NOT EXISTS user_memory (
+        user_id TEXT PRIMARY KEY,
+        notes TEXT,
+        embedding BLOB,
+        updated_at REAL
+    );
+    """
+    class MemoryStore:
+        def __init__(self, db_path: Optional[str] = None) -> None:
+            self.db_path = db_path or DB_PATH
+            os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
+            self._init_db()
+        def _connect(self):
+            return sqlite3.connect(self.db_path)
+        def _init_db(self):
+            with self._connect() as conn:
+                conn.executescript(SCHEMA_SQL)
+                conn.commit()
+        # ---- Room mid-term memory ----
+        def upsert_room_memory(self, room_id: str, summary: str, key_topics_csv: str, last_turn_ts: float) -> None:
+            now = time.time()
+            with self._connect() as conn:
+                conn.execute(
+                    """
+                    INSERT INTO room_memory(room_id, summary, key_topics, last_turn_ts, updated_at)
+                    VALUES(?,?,?,?,?)
+                    ON CONFLICT(room_id) DO UPDATE SET
+                      summary=excluded.summary,
+                      key_topics=excluded.key_topics,
+                      last_turn_ts=excluded.last_turn_ts,
+                      updated_at=excluded.updated_at
+                    """,
+                    (str(room_id), summary or '', key_topics_csv or '', float(last_turn_ts or now), now)
+                )
+                conn.commit()
+        def get_room_memory(self, room_id: str) -> Optional[Dict[str, Any]]:
+            with self._connect() as conn:
+                cur = conn.execute("SELECT summary, key_topics FROM room_memory WHERE room_id=?", (str(room_id),))
+                row = cur.fetchone()
+                if not row:
+                    return None
+                return {"summary": row[0] or '', "key_topics": row[1] or ''}
+        # ---- User long-term memory ----
+        def upsert_user_memory(self, user_id: str, notes: str, embedding: Optional[bytes] = None) -> None:
+            now = time.time()
+            with self._connect() as conn:
+                conn.execute(
+                    """
+                    INSERT INTO user_memory(user_id, notes, embedding, updated_at)
+                    VALUES(?,?,?,?)
+                    ON CONFLICT(user_id) DO UPDATE SET
+                      notes=excluded.notes,
+                      embedding=excluded.embedding,
+                      updated_at=excluded.updated_at
+                    """,
+                    (str(user_id), notes or '', embedding, now)
+                )
+                conn.commit()
+        def get_user_memory(self, user_id: str) -> Optional[Dict[str, Any]]:
+            with self._connect() as conn:
+                cur = conn.execute("SELECT notes FROM user_memory WHERE user_id=?", (str(user_id),))
+                row = cur.fetchone()
+                if not row:
+                    return None
+                return {"notes": row[0] or ''}
+    memory_store = MemoryStore()

requirements.txt CHANGED Viewed

@@ -43,6 +43,8 @@ python-docx
 python-pptx
 pytesseract
 sqlalchemy
 celery
 redis
 python-jose[cryptography]

 python-pptx
 pytesseract
 sqlalchemy
+psycopg2-binary
+PyMySQL
 celery
 redis
 python-jose[cryptography]