import os
import traceback
from typing import Optional
from transformers import AutoTokenizer
import torch

# Load variables from a .env file when python-dotenv is installed; the script
# still runs without it and relies on the process environment alone.
try:
    from dotenv import load_dotenv
    load_dotenv()
    print("✅ .env 파일 로드됨")
except ImportError:
    print("⚠️ python-dotenv가 설치되지 않음")

# Hugging Face access token; None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# The run counts as "local" when LOCAL_TEST is present in the environment or
# a .env file exists one directory above the current working directory.
IS_LOCAL = 'LOCAL_TEST' in os.environ or os.path.exists('../.env')

# Pick the model source for the detected environment and report it.
if IS_LOCAL:
    print("🔍 환경: 로컬")
    # Local checkpoint directory, relative to the current working directory.
    MODEL_PATH = "../lily_llm_core/models/kanana-1.5-v-3b-instruct"
    print(f"🔍 로컬 모델 경로: {MODEL_PATH}")
    print(f"🔍 경로 존재: {os.path.exists(MODEL_PATH)}")
else:
    print("🔍 환경: 서버")
    # On the server the model id comes from MODEL_NAME, with a default repo.
    MODEL_PATH = os.environ.get("MODEL_NAME", "gbrabbit/lily-math-model")
    print(f"🔍 서버 모델: {MODEL_PATH}")

# Report only whether a token is configured, never its value.
if HF_TOKEN:
    print("🔍 토큰: ✅ 설정됨")
else:
    print("🔍 토큰: ❌ 설정되지 않음")

# Tokenizer smoke test: load the tokenizer, encode a short string, and decode
# it back, printing diagnostics along the way.
print("\n🔧 토크나이저 테스트 시작...")

# Bind the name up front so that code running after this section can detect a
# failed load (None) instead of hitting a NameError.
tokenizer = None

try:
    # Single source of truth for the loader options: the log below is printed
    # from this dict, so it cannot drift from what from_pretrained() receives.
    # (The previous log reported `use_fast: False` although that option was
    # never passed.)
    tokenizer_kwargs = {"trust_remote_code": True}

    print("📤 토크나이저 로딩 중...")
    print(f"   MODEL_PATH: {MODEL_PATH}")
    print(f"   IS_LOCAL: {IS_LOCAL}")
    for option_name, option_value in tokenizer_kwargs.items():
        print(f"   {option_name}: {option_value}")

    if not IS_LOCAL:
        # The token is only passed for the non-local (Hub) path and is added
        # after logging so its value is never echoed.
        tokenizer_kwargs["token"] = HF_TOKEN

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, **tokenizer_kwargs)

    print("✅ 토크나이저 로딩 완료")
    print(f"   타입: {type(tokenizer)}")
    print(f"   값: {tokenizer}")
    print(f"   hasattr('encode'): {hasattr(tokenizer, 'encode')}")
    print(f"   hasattr('__call__'): {hasattr(tokenizer, '__call__')}")

    # Encode a short sample and report the resulting tensor shapes.
    test_input = "안녕하세요"
    print(f"\n🔤 토크나이저 테스트: '{test_input}'")

    test_tokens = tokenizer(test_input, return_tensors="pt")
    print("   ✅ 토크나이저 호출 성공")
    print(f"   input_ids shape: {test_tokens['input_ids'].shape}")
    print(f"   attention_mask shape: {test_tokens['attention_mask'].shape}")

    # Decode the first (only) sequence of the batch back to text.
    decoded = tokenizer.decode(test_tokens['input_ids'][0], skip_special_tokens=True)
    print(f"   디코딩 결과: '{decoded}'")

except Exception as e:
    # Top-level boundary of a diagnostic script: report the failure and keep
    # going so the remaining sections can still print their own status.
    print(f"❌ 토크나이저 테스트 실패: {e}")
    print(f"   오류 타입: {type(e).__name__}")
    traceback.print_exc()

def strip_prompt_tokens(sequence_ids, prompt_ids):
    """Return ``sequence_ids`` without a leading copy of ``prompt_ids``.

    Args:
        sequence_ids: Iterable of token ids produced by generation.
        prompt_ids: Iterable of token ids that made up the prompt.

    Returns:
        A new list. When ``sequence_ids`` starts with ``prompt_ids`` the
        prefix is removed; otherwise the ids are returned unchanged, which
        covers a generate() that already returns only the new tokens.
    """
    sequence_ids = list(sequence_ids)
    prompt_ids = list(prompt_ids)
    prompt_len = len(prompt_ids)
    if prompt_len and sequence_ids[:prompt_len] == prompt_ids:
        return sequence_ids[prompt_len:]
    return sequence_ids


# Model smoke test: load the model, generate a reply for one chat-formatted
# prompt, and print the decoded result.
print("\n🔧 모델 테스트 시작...")

if globals().get("tokenizer") is None:
    # Without a tokenizer the generation step cannot run, so skip the
    # expensive model load instead of failing after it completes.
    print("⚠️ 토크나이저가 로드되지 않아 모델 테스트를 건너뜁니다")
else:
    try:
        print("📤 모델 로딩 중...")
        from modeling import KananaVForConditionalGeneration

        # Shared loader options; the token is only added for the non-local path.
        model_kwargs = {
            "torch_dtype": torch.float16,
            "trust_remote_code": True,
            "device_map": None,
            "low_cpu_mem_usage": True,
        }
        if not IS_LOCAL:
            model_kwargs["token"] = HF_TOKEN

        model = KananaVForConditionalGeneration.from_pretrained(
            MODEL_PATH,
            **model_kwargs,
        )

        print("✅ 모델 로딩 완료")

        # Build a single-turn chat prompt around the sample text.
        test_input = "안녕하세요"
        formatted_prompt = f"<|im_start|>user\n{test_input}<|im_end|>\n<|im_start|>assistant\n"
        max_length: Optional[int] = None

        inputs = tokenizer(
            formatted_prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )

        print(f"\n🤖 모델 추론 테스트: '{test_input}'")

        # Generation budget: fall back to 100 new tokens when max_length is unset.
        max_new_tokens = max_length or 100

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_new_tokens,
                repetition_penalty=1.1,
                no_repeat_ngram_size=2,
                # EOS doubles as the padding id for generation.
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=True,
            )

        print("   ✅ 모델 호출 성공")
        print(f"   outputs 타입: {type(outputs)}")
        print(f"   outputs shape: {outputs.shape}")

        # outputs[0] is the first (only) sequence of the batch.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Remove the prompt at token level. Splitting the decoded text on the
        # "<|im_start|>assistant\n" marker is unreliable: if the marker is a
        # special token, skip_special_tokens=True has already removed it and
        # the split never matches, leaving the prompt in the "response".
        # NOTE(review): whether this model's generate() echoes the prompt is
        # not visible here; strip_prompt_tokens handles both cases.
        new_token_ids = strip_prompt_tokens(
            outputs[0].tolist(),
            inputs["input_ids"][0].tolist(),
        )
        assistant_response = tokenizer.decode(new_token_ids, skip_special_tokens=True)

        print(f"   생성된 전체 텍스트: '{response}'")
        print(f"   어시스턴트 응답: '{assistant_response.strip()}'")

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and continue.
        print(f"❌ 모델 테스트 실패: {e}")
        print(f"   오류 타입: {type(e).__name__}")
        traceback.print_exc()

print("\n✅ 테스트 완료!")