{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-06-18 07:13:16,579] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/jeff/anaconda/compiler_compat/ld: cannot find -laio: No such file or directory\n",
      "collect2: error: ld returned 1 exit status\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: warning: librt.so.1, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: warning: libpthread.so.0, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: warning: libstdc++.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: warning: libm.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_logic_error(char const*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::logic_error@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::~locale()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_end_catch@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::logic_error::~logic_error()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__si_class_type_info@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new[](unsigned long)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak_hard()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::basic_streambuf(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*, unsigned long)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned short@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::resize(unsigned long, char)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char const*@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ctype<char>::_M_widen_init() const@GLIBCXX_3.4.11'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_invalid_argument(char const*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::operator=(std::locale const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_free_exception@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::notify_one()@GLIBCXX_3.4.11'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::~Init()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::~basic_string()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_pure_virtual@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::flush()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__class_type_info@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_rethrow@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_fstream<char, std::char_traits<char> >::~basic_fstream()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::compare(char const*) const@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::system_clock::now()@GLIBCXX_3.4.19'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Hash_bytes(void const*, unsigned long, unsigned long)@CXXABI_1.3.5'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long long>(long long)@GLIBCXX_3.4.9'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char*@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_Prime_rehash_policy::_M_need_rehash(unsigned long, unsigned long, unsigned long) const@GLIBCXX_3.4.18'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::out_of_range@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long>(unsigned long)@GLIBCXX_3.4.9'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::~ios_base()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::range_error::~range_error()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::~__basic_file()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_acquire@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<bool>(bool)@GLIBCXX_3.4.9'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::overflow_error@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::range_error@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_filebuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete[](void*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(unsigned long, char, std::allocator<char> const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_transfer(std::__detail::_List_node_base*, std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::replace(unsigned long, unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::exception@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_M_destroy(std::allocator<wchar_t> const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream& std::istream::_M_extract<double>(double&)@GLIBCXX_3.4.9'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::basic_ifstream(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(std::string const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new(unsigned long)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned int@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::domain_error@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char, unsigned long) const@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::put(char)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_alloc()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_thread_atexit@CXXABI_1.3.7'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned int*@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::~basic_ifstream()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::Init()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::condition_variable()@GLIBCXX_3.4.11'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::basic_filebuf()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::domain_error::~domain_error()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cerr@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char const*, unsigned long, unsigned long) const@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::allocator<char> const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::str() const@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::invalid_argument@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void*@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(std::string const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_ostringstream()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_unhook()@GLIBCXX_3.4.15'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::_M_sync(char*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<char, std::char_traits<char> >::~basic_iostream()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale(std::locale const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::exception::~exception()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_create(unsigned long, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::is_open() const@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_istringstream()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::swap(std::string&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::basic_streambuf(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::init(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_cast()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::clear(std::_Ios_Iostate)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::operator=(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long*@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete(void*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(int)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_M_destroy(std::allocator<char> const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<wchar_t, std::char_traits<wchar_t> >::~basic_iostream()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::runtime_error@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_stringstream()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long>(long)@GLIBCXX_3.4.9'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::get()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long long@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::out_of_range::~out_of_range()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::length_error::~length_error()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)@GLIBCXX_3.4.9'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::invalid_argument::~invalid_argument()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::swap(std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cout@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long long>(unsigned long long)@GLIBCXX_3.4.9'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<void const*>(void const*)@GLIBCXX_3.4.9'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::underflow_error@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::out_of_range@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_allocate_exception@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void const*@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::init(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::reserve(unsigned long)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_begin_catch@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::open(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_M_sync(wchar_t*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::getline(char*, long, char)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istream<char, std::char_traits<char> >& std::getline<char, std::char_traits<char>, std::allocator<char> >(std::basic_istream<char, std::char_traits<char> >&, std::basic_string<char, std::char_traits<char>, std::allocator<char> >&, char)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::~condition_variable()@GLIBCXX_3.4.11'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::insert(unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(char const*, unsigned long)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned char@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::ios_base()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_out_of_range(char const*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::overflow_error::~overflow_error()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_length_error(char const*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_system_error(int)@GLIBCXX_3.4.11'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ofstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<double>(double)@GLIBCXX_3.4.9'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::operator=(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long long@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_release@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_throw@CXXABI_1.3'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::underflow_error::~underflow_error()@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::length_error@GLIBCXX_3.4'\n",
      "/mnt/jeff/anaconda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::~basic_filebuf()@GLIBCXX_3.4'\n",
      "collect2: error: ld returned 1 exit status\n",
      "/mnt/jeff/huggingface/modules/transformers_modules/speech_conformer_encoder.py:2775: FutureWarning: Please specify CheckpointImpl.NO_REENTRANT as CheckpointImpl.REENTRANT will soon be removed as the default and eventually deprecated.\n",
      "  lambda i: encoder_checkpoint_wrapper(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "######################## speech lora #############\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f0f347f300534909b2d5db8ca3ea5df4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n"
     ]
    }
   ],
   "source": [
    "from io import BytesIO\n",
    "from urllib.request import urlopen\n",
    "import soundfile\n",
    "import torch\n",
    "from datasets import load_dataset, Audio\n",
    "import numpy as np\n",
    "from transformers import AutoModel, AutoProcessor, BatchFeature,Gemma3ForCausalLM,Gemma3Processor\n",
    "from tqdm import tqdm\n",
    "import json\n",
    "import os\n",
    "import time\n",
    "from datetime import datetime\n",
    "from whisper_normalizer.english import EnglishTextNormalizer\n",
    "from whisper_normalizer.basic import BasicTextNormalizer\n",
    "import sacrebleu\n",
    "from jiwer import cer, wer\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "import soundfile as sf\n",
    "import re\n",
    "from pathlib import Path\n",
    "import opencc\n",
    "from ASRDataset import *\n",
    "\n",
    "# converter = opencc.OpenCC('s2tw.json')\n",
    "\n",
    "model_id = \"./\"\n",
    "revision = \"main\" #\"v1.0\"\n",
    "\n",
    "model = AutoModel.from_pretrained(\n",
    "    model_id, device_map=\"cuda\", revision = revision, trust_remote_code=True\n",
    ").eval()\n",
    "\n",
    "processor = AutoProcessor.from_pretrained(\n",
    "    model_id, revision = revision, trust_remote_code=True\n",
    ")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "pickup_dataset = MultiturnAudioDataset(split='eval',processor=processor,json_path='/mnt/jeff/InCar/data/multiturn_data/pickup_processed.json')\n",
    "dataloader = DataLoader(pickup_dataset, batch_size=1, shuffle=False, collate_fn=covost_collate_fn)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/434 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 1.1206e-04, -1.1301e-04, -2.7299e-05,  ..., -9.0599e-06,\n",
      "           1.3611e-02,  2.9325e-05],\n",
      "         [ 1.1206e-04, -1.1301e-04, -2.7299e-05,  ..., -9.0599e-06,\n",
      "           1.3611e-02,  2.9325e-05],\n",
      "         [ 1.1206e-04, -1.1301e-04, -2.7299e-05,  ..., -9.0599e-06,\n",
      "           1.3611e-02,  2.9325e-05],\n",
      "         ...,\n",
      "         [ 1.1206e-04, -1.1301e-04, -2.7299e-05,  ..., -9.0599e-06,\n",
      "           1.3611e-02,  2.9325e-05],\n",
      "         [ 8.3447e-05, -2.9297e-02, -4.3678e-04,  ..., -1.2146e-02,\n",
      "           1.0681e-02, -2.3193e-03],\n",
      "         [-7.7438e-04, -7.7209e-03,  1.2665e-03,  ...,  5.5313e-04,\n",
      "          -1.1841e-02, -2.6093e-03]]], device='cuda:0')\n",
      "audio_attention_mask None\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/jeff/anaconda/lib/python3.12/site-packages/torch/utils/checkpoint.py:87: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "audio_features tensor([[[-0.3780, -0.7710,  0.3672,  ..., -0.5870,  0.4069,  0.8486],\n",
      "         [ 0.1079,  0.1348, -0.6116,  ..., -2.2154,  0.5705,  0.8937],\n",
      "         [-1.0184,  1.8919,  0.3304,  ...,  0.3861,  0.6337, -0.4413],\n",
      "         ...,\n",
      "         [-0.0304,  0.0203, -0.0488,  ...,  0.0108, -0.0134,  0.0664],\n",
      "         [-1.0408,  0.2857,  0.1969,  ...,  0.0895, -1.0475,  0.4363],\n",
      "         [-1.8609, -0.4888, -0.2397,  ..., -1.5569, -1.0248, -0.4421]]],\n",
      "       device='cuda:0')\n",
      "audio_features tensor(0, device='cuda:0') tensor([[[3.5189e+22, 4.5876e-41, 3.6358e+22,  ..., 4.5876e-41,\n",
      "          3.5988e+22, 4.5876e-41],\n",
      "         [3.5189e+22, 4.5876e-41, 3.6358e+22,  ..., 4.5876e-41,\n",
      "          3.5988e+22, 4.5876e-41],\n",
      "         [3.5189e+22, 4.5876e-41, 3.6358e+22,  ..., 4.5876e-41,\n",
      "          3.5988e+22, 4.5876e-41],\n",
      "         ...,\n",
      "         [3.5189e+22, 4.5876e-41, 3.6358e+22,  ..., 4.5876e-41,\n",
      "          3.5988e+22, 4.5876e-41],\n",
      "         [3.5189e+22, 4.5876e-41, 3.6358e+22,  ..., 4.5876e-41,\n",
      "          3.5988e+22, 4.5876e-41],\n",
      "         [3.5189e+22, 4.5876e-41, 3.6358e+22,  ..., 4.5876e-41,\n",
      "          3.5988e+22, 4.5876e-41]]], device='cuda:0')\n",
      "masked_audio_features tensor(0, device='cuda:0') tensor([[3.5189e+22, 4.5876e-41, 3.6358e+22,  ..., 4.5876e-41, 3.5988e+22,\n",
      "         4.5876e-41],\n",
      "        [3.5189e+22, 4.5876e-41, 3.6358e+22,  ..., 4.5876e-41, 3.5988e+22,\n",
      "         4.5876e-41],\n",
      "        [3.5189e+22, 4.5876e-41, 3.6358e+22,  ..., 4.5876e-41, 3.5988e+22,\n",
      "         4.5876e-41],\n",
      "        ...,\n",
      "        [3.5189e+22, 4.5876e-41, 3.6358e+22,  ..., 4.5876e-41, 3.5988e+22,\n",
      "         4.5876e-41],\n",
      "        [3.5189e+22, 4.5876e-41, 3.6358e+22,  ..., 4.5876e-41, 3.5988e+22,\n",
      "         4.5876e-41],\n",
      "        [3.5189e+22, 4.5876e-41, 3.6358e+22,  ..., 4.5876e-41, 3.5988e+22,\n",
      "         4.5876e-41]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 1.1206e-04, -1.1301e-04, -2.7299e-05,  ..., -9.0599e-06,\n",
      "           1.3611e-02,  2.9325e-05],\n",
      "         [ 1.1206e-04, -1.1301e-04, -2.7299e-05,  ..., -9.0599e-06,\n",
      "           1.3611e-02,  2.9325e-05],\n",
      "         [ 1.1206e-04, -1.1301e-04, -2.7299e-05,  ..., -9.0599e-06,\n",
      "           1.3611e-02,  2.9325e-05],\n",
      "         ...,\n",
      "         [ 1.1206e-04, -1.1301e-04, -2.7299e-05,  ..., -9.0599e-06,\n",
      "           1.3611e-02,  2.9325e-05],\n",
      "         [ 8.3447e-05, -2.9297e-02, -4.3678e-04,  ..., -1.2146e-02,\n",
      "           1.0681e-02, -2.3193e-03],\n",
      "         [-7.7438e-04, -7.7209e-03,  1.2665e-03,  ...,  5.5313e-04,\n",
      "          -1.1841e-02, -2.6093e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.7934,  2.2984,  4.6432,  ..., -1.8128, -1.8129, -1.8130]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0186,  0.0043,  0.0227,  ...,  0.0088, -0.0049,  0.0025]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0186,  0.0043,  0.0227,  ...,  0.0088, -0.0049,  0.0025]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.2565,  2.7995, 12.6682,  ..., -4.1137, -4.1120, -4.1128]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0011,  0.0184,  0.0028,  ...,  0.0006, -0.0128, -0.0151]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0011,  0.0184,  0.0028,  ...,  0.0006, -0.0128, -0.0151]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.0887, -5.8322,  0.7939,  ..., -0.5516, -0.5513, -0.5520]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0113, -0.0161,  0.0079,  ...,  0.0197, -0.0008, -0.0082]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0113, -0.0161,  0.0079,  ...,  0.0197, -0.0008, -0.0082]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 7.4639,  3.4059,  1.3521,  ..., -2.6798, -2.6809, -2.6811]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-2.3007e-05,  5.9891e-04, -5.3787e-04,  ...,  4.5776e-04,\n",
      "           5.6839e-04, -2.7275e-04]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-2.3007e-05,  5.9891e-04, -5.3787e-04,  ...,  4.5776e-04,\n",
      "           5.6839e-04, -2.7275e-04]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.9671, -3.2369,  0.9566,  ..., -1.4895, -1.4896, -1.4902]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0104,  0.0069, -0.0060,  ...,  0.0010,  0.0042, -0.0015]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0104,  0.0069, -0.0060,  ...,  0.0010,  0.0042, -0.0015]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.2403,  0.2873,  2.4648,  ..., -1.0528, -1.0534, -1.0542]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0021, -0.0007,  0.0070,  ..., -0.0084, -0.0083, -0.0017]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0021, -0.0007,  0.0070,  ..., -0.0084, -0.0083, -0.0017]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.4957,  2.0423,  0.7912,  ..., -2.3032, -2.3031, -2.3025]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0124, -0.0187, -0.0081,  ..., -0.0090, -0.0092,  0.0231]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0124, -0.0187, -0.0081,  ..., -0.0090, -0.0092,  0.0231]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.8442, -0.7132,  1.0167,  ..., -1.9491, -1.9485, -1.9490]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0015, -0.0172, -0.0028,  ..., -0.0081,  0.0011, -0.0062]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0015, -0.0172, -0.0028,  ..., -0.0081,  0.0011, -0.0062]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.7503, -1.2019, -2.7825,  ..., -1.4378, -1.4368, -1.4375]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0204, -0.0112,  0.0025,  ..., -0.0184,  0.0095,  0.0031]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0204, -0.0112,  0.0025,  ..., -0.0184,  0.0095,  0.0031]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.7866,  5.1949,  1.4227,  ..., -2.1833, -2.1827, -2.1830]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0131, -0.0034, -0.0005,  ...,  0.0061, -0.0068, -0.0160]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0131, -0.0034, -0.0005,  ...,  0.0061, -0.0068, -0.0160]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 7.4504,  0.9829,  3.5831,  ..., -2.8693, -2.8691, -2.8692]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0030,  0.0056,  0.0034,  ..., -0.0063, -0.0016,  0.0013]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0030,  0.0056,  0.0034,  ..., -0.0063, -0.0016,  0.0013]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.1066, -0.4927,  0.0675,  ..., -1.7429, -1.7428, -1.7433]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.7224,  1.7236, -0.3631,  ..., -1.1573, -1.1571, -1.1564]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0029, -0.0006, -0.0219,  ..., -0.0032,  0.0011,  0.0075]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0029, -0.0006, -0.0219,  ..., -0.0032,  0.0011,  0.0075]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.9264, -1.4750, -0.0503,  ..., -0.5115, -0.5112, -0.5118]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0034,  0.0038, -0.0043,  ..., -0.0071,  0.0030, -0.0153]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0034,  0.0038, -0.0043,  ..., -0.0071,  0.0030, -0.0153]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.2001,  0.3211, -0.7397,  ..., -1.4245, -1.4242, -1.4246]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0009, -0.0044, -0.0095,  ..., -0.0183,  0.0014,  0.0052]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0009, -0.0044, -0.0095,  ..., -0.0183,  0.0014,  0.0052]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 6.0811,  1.3457,  3.4966,  ..., -1.9978, -1.9977, -1.9982]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0067, -0.0047,  0.0002,  ...,  0.0022, -0.0071,  0.0003]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0067, -0.0047,  0.0002,  ...,  0.0022, -0.0071,  0.0003]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.3244,  5.0375, 10.6473,  ..., -0.8292, -0.8280, -0.8281]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0112, -0.0073,  0.0118,  ..., -0.0182,  0.0040,  0.0034]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0112, -0.0073,  0.0118,  ..., -0.0182,  0.0040,  0.0034]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.5047,  2.7371,  3.7994,  ...,  1.6258,  1.6262,  1.6260]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0195, -0.0029,  0.0133,  ..., -0.0145,  0.0056,  0.0117]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0195, -0.0029,  0.0133,  ..., -0.0145,  0.0056,  0.0117]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.9971, -2.0421,  1.8777,  ..., -2.1895, -2.1888, -2.1893]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0093, -0.0107,  0.0162,  ..., -0.0099,  0.0160,  0.0139]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0093, -0.0107,  0.0162,  ..., -0.0099,  0.0160,  0.0139]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.2119,  2.1697, 12.1325,  ..., -2.3442, -2.3435, -2.3442]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0012, -0.0006,  0.0006,  ..., -0.0002,  0.0013, -0.0006]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0012, -0.0006,  0.0006,  ..., -0.0002,  0.0013, -0.0006]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.4543,  5.5360, 10.5822,  ..., -2.5854, -2.5850, -2.5851]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0032, -0.0039, -0.0035,  ..., -0.0044,  0.0006,  0.0049]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0032, -0.0039, -0.0035,  ..., -0.0044,  0.0006,  0.0049]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.4849,  2.5870,  3.0230,  ..., -1.4589, -1.4583, -1.4581]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.6099, -4.4040,  0.9232,  ...,  0.1388,  0.1387,  0.1388]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0050,  0.0156, -0.0234,  ...,  0.0087, -0.0071,  0.0013]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0050,  0.0156, -0.0234,  ...,  0.0087, -0.0071,  0.0013]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.1205,  1.9262,  1.8272,  ..., -1.6031, -1.6030, -1.6035]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0027, -0.0110, -0.0009,  ..., -0.0010, -0.0021, -0.0044]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0027, -0.0110, -0.0009,  ..., -0.0010, -0.0021, -0.0044]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.0034,  2.5199,  1.3652,  ..., -1.9592, -1.9590, -1.9597]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0023,  0.0059, -0.0074,  ..., -0.0142, -0.0020, -0.0064]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0023,  0.0059, -0.0074,  ..., -0.0142, -0.0020, -0.0064]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.4204,  0.7696,  1.5233,  ..., -1.0070, -1.0064, -1.0064]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0119, -0.0023, -0.0126,  ..., -0.0117, -0.0003,  0.0177]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0119, -0.0023, -0.0126,  ..., -0.0117, -0.0003,  0.0177]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.9769,  0.4621,  0.0646,  ..., -2.0527, -2.0520, -2.0522]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0008, -0.0077,  0.0013,  ...,  0.0006, -0.0118, -0.0026]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0008, -0.0077,  0.0013,  ...,  0.0006, -0.0118, -0.0026]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.5167, -2.4095,  4.0113,  ...,  0.4679,  0.4686,  0.4687]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 7.7175,  0.3446, -0.0167,  ..., -1.2109, -1.2103, -1.2105]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.5951, -5.4225, -3.7345,  ..., -1.0607, -1.0593, -1.0597]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0099, -0.0087,  0.0162,  ...,  0.0126,  0.0192, -0.0121]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0099, -0.0087,  0.0162,  ...,  0.0126,  0.0192, -0.0121]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.0541, -1.9950,  1.6296,  ...,  2.7531,  2.7529,  2.7528]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0214, -0.0051, -0.0049,  ...,  0.0039,  0.0325,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0214, -0.0051, -0.0049,  ...,  0.0039,  0.0325,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.8740, -1.0866,  3.7133,  ...,  0.7347,  0.7341,  0.7336]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0034,  0.0038, -0.0043,  ..., -0.0071,  0.0030, -0.0153]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0034,  0.0038, -0.0043,  ..., -0.0071,  0.0030, -0.0153]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.8324, -1.2917, -1.9766,  ..., -0.7884, -0.7868, -0.7882]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0008, -0.0077,  0.0013,  ...,  0.0006, -0.0118, -0.0026]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0008, -0.0077,  0.0013,  ...,  0.0006, -0.0118, -0.0026]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[2.1346, 2.9151, 5.7393,  ..., 0.6441, 0.6446, 0.6450]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.6980, -0.8688,  2.1842,  ..., -2.0267, -2.0266, -2.0268]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0038, -0.0038, -0.0035,  ..., -0.0061,  0.0021,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0038, -0.0038, -0.0035,  ..., -0.0061,  0.0021,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.0610, -3.9665, -1.8194,  ..., -1.0435, -1.0427, -1.0431]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-3.1728, -9.4189, -4.4757,  ...,  0.2838,  0.2840,  0.2837]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0123, -0.0071, -0.0078,  ...,  0.0280,  0.0082,  0.0076]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0123, -0.0071, -0.0078,  ...,  0.0280,  0.0082,  0.0076]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.9835, -0.5888,  5.5906,  ...,  1.9771,  1.9764,  1.9766]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0013,  0.0209,  0.0107,  ...,  0.0001, -0.0162,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0013,  0.0209,  0.0107,  ...,  0.0001, -0.0162,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.1893,  5.9305, -1.5514,  ...,  0.8957,  0.8958,  0.8948]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.0594,  5.3235,  0.9858,  ..., -0.2280, -0.2277, -0.2280]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-4.4447, -7.1798, -2.1469,  ...,  1.3249,  1.3259,  1.3248]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0032, -0.0012, -0.0282,  ..., -0.0007,  0.0082, -0.0076]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0032, -0.0012, -0.0282,  ..., -0.0007,  0.0082, -0.0076]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.1925,  7.5137, -1.4380,  ..., -1.3681, -1.3679, -1.3685]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0009,  0.0052, -0.0075,  ..., -0.0007, -0.0165,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0009,  0.0052, -0.0075,  ..., -0.0007, -0.0165,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.1869,  7.4950, -1.0499,  ..., -2.0301, -2.0299, -2.0307]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.2059, -5.9487, -1.4194,  ...,  1.9609,  1.9607,  1.9604]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0187,  0.0025, -0.0248,  ...,  0.0033, -0.0015,  0.0059]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0187,  0.0025, -0.0248,  ...,  0.0033, -0.0015,  0.0059]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.3181,  2.1855, -0.2673,  ...,  2.1398,  2.1397,  2.1388]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.9254,  4.2009,  0.6930,  ...,  1.3047,  1.3048,  1.3044]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.4280, -2.5302,  3.9338,  ...,  1.5977,  1.5980,  1.5976]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0015,  0.0138, -0.0315,  ...,  0.0110, -0.0102,  0.0093]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0015,  0.0138, -0.0315,  ...,  0.0110, -0.0102,  0.0093]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.2573, -0.7269,  0.1401,  ..., -0.7753, -0.7754, -0.7754]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0032, -0.0039, -0.0035,  ..., -0.0044,  0.0006,  0.0049]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0032, -0.0039, -0.0035,  ..., -0.0044,  0.0006,  0.0049]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.0096,  6.5689,  3.1045,  ...,  0.0617,  0.0614,  0.0612]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0009,  0.0052, -0.0075,  ..., -0.0007, -0.0165,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0009,  0.0052, -0.0075,  ..., -0.0007, -0.0165,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.5081,  8.0743, -3.3691,  ..., -0.8022, -0.8020, -0.8021]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.3255, -4.4397,  1.4442,  ...,  1.8905,  1.8896,  1.8900]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0028,  0.0109,  0.0155,  ..., -0.0022, -0.0015,  0.0043]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0028,  0.0109,  0.0155,  ..., -0.0022, -0.0015,  0.0043]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.7291, 14.5658,  0.6573,  ...,  0.8771,  0.8763,  0.8766]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-3.6418,  4.4336,  1.9554,  ...,  0.9325,  0.9328,  0.9330]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-3.9466, -8.3290, -3.9492,  ...,  0.4338,  0.4342,  0.4337]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 2.4567e-03,  4.6692e-03, -1.2695e-02,  ...,  2.8163e-06,\n",
      "          -4.5471e-03,  8.9111e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 2.4567e-03,  4.6692e-03, -1.2695e-02,  ...,  2.8163e-06,\n",
      "          -4.5471e-03,  8.9111e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 5.9396,  7.1487,  1.2435,  ..., -0.0705, -0.0698, -0.0702]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0093, -0.0001,  0.0025,  ..., -0.0083, -0.0006,  0.0138]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0093, -0.0001,  0.0025,  ..., -0.0083, -0.0006,  0.0138]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 7.5412,  7.5700,  0.5203,  ..., -1.4171, -1.4170, -1.4175]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0035, -0.0209, -0.0248,  ..., -0.0223,  0.0069, -0.0127]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0035, -0.0209, -0.0248,  ..., -0.0223,  0.0069, -0.0127]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.2330,  0.7541,  5.9979,  ..., -0.5614, -0.5596, -0.5596]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0067,  0.0127,  0.0075,  ...,  0.0030, -0.0160, -0.0062]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0067,  0.0127,  0.0075,  ...,  0.0030, -0.0160, -0.0062]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.4387, -1.8750, -1.0399,  ..., -1.9745, -1.9736, -1.9744]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.9822, -7.1292, -2.0205,  ...,  0.2056,  0.2056,  0.2057]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0038, -0.0052, -0.0262,  ...,  0.0052, -0.0012, -0.0031]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0038, -0.0052, -0.0262,  ...,  0.0052, -0.0012, -0.0031]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.7589,  1.3610,  1.3912,  ..., -1.5507, -1.5498, -1.5514]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0014, -0.0012, -0.0172,  ..., -0.0068, -0.0176, -0.0036]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0014, -0.0012, -0.0172,  ..., -0.0068, -0.0176, -0.0036]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.9875, -2.4957, -3.1603,  ..., -1.8431, -1.8425, -1.8430]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0099, -0.0087,  0.0162,  ...,  0.0126,  0.0192, -0.0121]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0099, -0.0087,  0.0162,  ...,  0.0126,  0.0192, -0.0121]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.0917, -2.3463,  0.8627,  ...,  3.5735,  3.5729,  3.5730]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0214, -0.0051, -0.0049,  ...,  0.0039,  0.0325,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0214, -0.0051, -0.0049,  ...,  0.0039,  0.0325,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.7085,  2.2257,  3.1996,  ..., -0.9459, -0.9459, -0.9463]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0034,  0.0038, -0.0043,  ..., -0.0071,  0.0030, -0.0153]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0034,  0.0038, -0.0043,  ..., -0.0071,  0.0030, -0.0153]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.9164, -4.1734, -2.0961,  ..., -0.4731, -0.4724, -0.4733]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0033,  0.0215, -0.0002,  ..., -0.0026, -0.0072,  0.0089]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0033,  0.0215, -0.0002,  ..., -0.0026, -0.0072,  0.0089]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.0231, -1.2131, -1.5379,  ..., -0.5193, -0.5186, -0.5198]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.5671,  0.6028,  0.6565,  ..., -2.9617, -2.9612, -2.9623]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0003, -0.0167,  0.0054,  ..., -0.0063,  0.0168, -0.0089]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0003, -0.0167,  0.0054,  ..., -0.0063,  0.0168, -0.0089]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.9811, -2.6383, -2.2409,  ..., -0.1538, -0.1534, -0.1533]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0043, -0.0212,  0.0034,  ...,  0.0007,  0.0024,  0.0067]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0043, -0.0212,  0.0034,  ...,  0.0007,  0.0024,  0.0067]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.2077, -4.2061, -2.5550,  ..., -1.1109, -1.1103, -1.1108]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-2.3007e-05,  5.9891e-04, -5.3787e-04,  ...,  4.5776e-04,\n",
      "           5.6839e-04, -2.7275e-04]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-2.3007e-05,  5.9891e-04, -5.3787e-04,  ...,  4.5776e-04,\n",
      "           5.6839e-04, -2.7275e-04]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.5544, -3.6029, -0.9975,  ..., -0.7386, -0.7386, -0.7391]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0053,  0.0085, -0.0011,  ...,  0.0109,  0.0017, -0.0053]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0053,  0.0085, -0.0011,  ...,  0.0109,  0.0017, -0.0053]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.7994,  2.7957,  3.5508,  ..., -1.8706, -1.8699, -1.8698]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0071,  0.0157,  0.0038,  ...,  0.0063,  0.0005, -0.0036]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0071,  0.0157,  0.0038,  ...,  0.0063,  0.0005, -0.0036]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.1607, -2.5416, -1.9401,  ..., -2.2790, -2.2785, -2.2785]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0006,  0.0131,  0.0123,  ..., -0.0098,  0.0084,  0.0076]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0006,  0.0131,  0.0123,  ..., -0.0098,  0.0084,  0.0076]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.6444,  0.8783, -0.6482,  ..., -2.0336, -2.0326, -2.0335]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0140, -0.0036,  0.0131,  ..., -0.0157,  0.0089, -0.0093]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0140, -0.0036,  0.0131,  ..., -0.0157,  0.0089, -0.0093]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.5079, -1.1467,  2.4725,  ..., -2.3392, -2.3386, -2.3387]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0028,  0.0227, -0.0025,  ..., -0.0023, -0.0020, -0.0031]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0028,  0.0227, -0.0025,  ..., -0.0023, -0.0020, -0.0031]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.0357, -1.3614, -1.9151,  ..., -2.0196, -2.0184, -2.0186]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0124, -0.0024,  0.0277,  ..., -0.0075, -0.0117, -0.0106]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0124, -0.0024,  0.0277,  ..., -0.0075, -0.0117, -0.0106]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.7580,  2.0423, -0.3424,  ..., -0.8082, -0.8071, -0.8080]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0036, -0.0010, -0.0231,  ...,  0.0099,  0.0045, -0.0049]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0036, -0.0010, -0.0231,  ...,  0.0099,  0.0045, -0.0049]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.1586,  2.1668,  0.7074,  ..., -1.0520, -1.0506, -1.0520]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0021, -0.0007,  0.0070,  ..., -0.0084, -0.0083, -0.0017]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0021, -0.0007,  0.0070,  ..., -0.0084, -0.0083, -0.0017]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.3720,  2.2316, -0.6996,  ..., -1.9173, -1.9168, -1.9169]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0119, -0.0023, -0.0126,  ..., -0.0117, -0.0003,  0.0177]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0119, -0.0023, -0.0126,  ..., -0.0117, -0.0003,  0.0177]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.1942,  1.4668, -0.9189,  ..., -2.0954, -2.0946, -2.0956]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0031,  0.0244,  0.0086,  ..., -0.0096, -0.0003,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0031,  0.0244,  0.0086,  ..., -0.0096, -0.0003,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.6506,  0.6830, -1.7550,  ..., -1.3549, -1.3537, -1.3547]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0074,  0.0080,  0.0114,  ..., -0.0024, -0.0217,  0.0145]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0074,  0.0080,  0.0114,  ..., -0.0024, -0.0217,  0.0145]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.7167, -1.9204, -2.9044,  ..., -1.6527, -1.6526, -1.6526]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0015, -0.0030,  0.0022,  ...,  0.0002, -0.0110,  0.0003]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0015, -0.0030,  0.0022,  ...,  0.0002, -0.0110,  0.0003]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.7033, -5.2305, -1.1888,  ..., -1.9244, -1.9236, -1.9244]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0152,  0.0198,  0.0049,  ...,  0.0122, -0.0074,  0.0068]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0152,  0.0198,  0.0049,  ...,  0.0122, -0.0074,  0.0068]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.6288,  1.3090, -0.4249,  ..., -3.0354, -3.0352, -3.0351]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-2.3007e-05,  5.9891e-04, -5.3787e-04,  ...,  4.5776e-04,\n",
      "           5.6839e-04, -2.7275e-04]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-2.3007e-05,  5.9891e-04, -5.3787e-04,  ...,  4.5776e-04,\n",
      "           5.6839e-04, -2.7275e-04]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.7025, -3.5614, -4.7877,  ..., -1.3447, -1.3446, -1.3459]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.7389, -0.3433,  0.2352,  ..., -1.5533, -1.5532, -1.5534]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0119, -0.0023, -0.0126,  ..., -0.0117, -0.0003,  0.0177]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0119, -0.0023, -0.0126,  ..., -0.0117, -0.0003,  0.0177]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.6118,  1.9146, -2.2677,  ..., -1.5224, -1.5219, -1.5221]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0071,  0.0157,  0.0038,  ...,  0.0063,  0.0005, -0.0036]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0071,  0.0157,  0.0038,  ...,  0.0063,  0.0005, -0.0036]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.0538,  0.1847, -0.0066,  ..., -1.2810, -1.2807, -1.2814]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0116,  0.0066, -0.0007,  ..., -0.0010, -0.0170,  0.0112]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0116,  0.0066, -0.0007,  ..., -0.0010, -0.0170,  0.0112]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.5418, -0.5133, -2.0605,  ..., -1.6131, -1.6124, -1.6129]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0033, -0.0028,  0.0010,  ..., -0.0007, -0.0136,  0.0014]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0033, -0.0028,  0.0010,  ..., -0.0007, -0.0136,  0.0014]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.0946,  2.1488, -2.5894,  ..., -1.0617, -1.0614, -1.0617]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.1083, -1.4027, -0.0926,  ..., -1.7811, -1.7810, -1.7805]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0029, -0.0006, -0.0219,  ..., -0.0032,  0.0011,  0.0075]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0029, -0.0006, -0.0219,  ..., -0.0032,  0.0011,  0.0075]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.4851,  1.3151,  2.2046,  ..., -1.4687, -1.4678, -1.4687]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0034,  0.0038, -0.0043,  ..., -0.0071,  0.0030, -0.0153]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0034,  0.0038, -0.0043,  ..., -0.0071,  0.0030, -0.0153]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.7209,  1.3604,  1.6992,  ..., -1.0811, -1.0805, -1.0813]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 7.8201e-05, -9.9487e-03, -1.1368e-03,  ..., -4.2534e-04,\n",
      "          -8.6670e-03,  7.5531e-04]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 7.8201e-05, -9.9487e-03, -1.1368e-03,  ..., -4.2534e-04,\n",
      "          -8.6670e-03,  7.5531e-04]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.7127,  1.9661,  5.3323,  ..., -1.2006, -1.1996, -1.1994]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0186,  0.0043,  0.0227,  ...,  0.0088, -0.0049,  0.0025]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0186,  0.0043,  0.0227,  ...,  0.0088, -0.0049,  0.0025]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[1.6295, 4.9053, 6.9591,  ..., 0.7427, 0.7442, 0.7442]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0112, -0.0073,  0.0118,  ..., -0.0182,  0.0040,  0.0034]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0112, -0.0073,  0.0118,  ..., -0.0182,  0.0040,  0.0034]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.2234,  2.8452,  2.4710,  ...,  0.3933,  0.3946,  0.3943]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0195, -0.0029,  0.0133,  ..., -0.0145,  0.0056,  0.0117]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0195, -0.0029,  0.0133,  ..., -0.0145,  0.0056,  0.0117]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.7097, -0.2919,  2.7117,  ..., -1.8346, -1.8338, -1.8346]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0093, -0.0107,  0.0162,  ..., -0.0099,  0.0160,  0.0139]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0093, -0.0107,  0.0162,  ..., -0.0099,  0.0160,  0.0139]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.5411, -1.1590,  5.3431,  ..., -2.7299, -2.7287, -2.7304]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0012, -0.0006,  0.0006,  ..., -0.0002,  0.0013, -0.0006]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0012, -0.0006,  0.0006,  ..., -0.0002,  0.0013, -0.0006]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.5263e-03,  2.7761e+00,  2.9797e+00,  ..., -2.1910e+00,\n",
      "          -2.1894e+00, -2.1901e+00]]], device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0032, -0.0039, -0.0035,  ..., -0.0044,  0.0006,  0.0049]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0032, -0.0039, -0.0035,  ..., -0.0044,  0.0006,  0.0049]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.0052, -2.5844,  1.9069,  ..., -2.1225, -2.1221, -2.1225]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0015, -0.0030,  0.0022,  ...,  0.0002, -0.0110,  0.0003]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0015, -0.0030,  0.0022,  ...,  0.0002, -0.0110,  0.0003]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.0586, -6.4708,  1.5275,  ..., -0.8475, -0.8479, -0.8481]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0032, -0.0039, -0.0035,  ..., -0.0044,  0.0006,  0.0049]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0032, -0.0039, -0.0035,  ..., -0.0044,  0.0006,  0.0049]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.0138,  0.3866,  0.0886,  ..., -0.7036, -0.7033, -0.7035]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0093,  0.0282, -0.0019,  ..., -0.0020, -0.0159,  0.0023]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0093,  0.0282, -0.0019,  ..., -0.0020, -0.0159,  0.0023]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[1.0246, 1.9234, 4.0165,  ..., 2.2692, 2.2693, 2.2690]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0223,  0.0052,  0.0098,  ..., -0.0064, -0.0166,  0.0056]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0223,  0.0052,  0.0098,  ..., -0.0064, -0.0166,  0.0056]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.3331, -2.7439,  0.0976,  ..., -1.3493, -1.3491, -1.3491]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0063,  0.0005, -0.0061,  ...,  0.0058,  0.0203,  0.0034]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0063,  0.0005, -0.0061,  ...,  0.0058,  0.0203,  0.0034]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.6559,  3.5249,  5.2717,  ..., -0.0967, -0.0954, -0.0960]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0013,  0.0151, -0.0018,  ...,  0.0022,  0.0057, -0.0040]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0013,  0.0151, -0.0018,  ...,  0.0022,  0.0057, -0.0040]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.5113, -0.5395,  2.2768,  ..., -2.2552, -2.2549, -2.2555]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0030, -0.0137, -0.0044,  ..., -0.0031, -0.0102,  0.0090]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0030, -0.0137, -0.0044,  ..., -0.0031, -0.0102,  0.0090]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.5734,  0.8078, -1.6174,  ..., -1.6196, -1.6188, -1.6205]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.1237,  2.2872,  1.2362,  ..., -2.7545, -2.7537, -2.7542]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0119, -0.0023, -0.0126,  ..., -0.0117, -0.0003,  0.0177]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0119, -0.0023, -0.0126,  ..., -0.0117, -0.0003,  0.0177]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.3226,  2.4809,  0.4301,  ..., -1.7120, -1.7113, -1.7122]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0036,  0.0223, -0.0112,  ...,  0.0092, -0.0005,  0.0037]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0036,  0.0223, -0.0112,  ...,  0.0092, -0.0005,  0.0037]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.7190, -0.7898, -1.3163,  ..., -0.8909, -0.8900, -0.8909]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0008, -0.0077,  0.0013,  ...,  0.0006, -0.0118, -0.0026]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0008, -0.0077,  0.0013,  ...,  0.0006, -0.0118, -0.0026]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[4.3945, 2.9410, 8.2311,  ..., 0.2492, 0.2502, 0.2505]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 5.5108, -0.8515,  2.6811,  ..., -1.6113, -1.6104, -1.6109]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.5121, -3.6095, -1.5243,  ..., -1.2095, -1.2087, -1.2094]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0022,  0.0114,  0.0063,  ..., -0.0049,  0.0018,  0.0047]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0022,  0.0114,  0.0063,  ..., -0.0049,  0.0018,  0.0047]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-4.4707, -2.5356,  2.7678,  ...,  5.6256,  5.6249,  5.6255]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0214, -0.0051, -0.0049,  ...,  0.0039,  0.0325,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0214, -0.0051, -0.0049,  ...,  0.0039,  0.0325,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.4477, -0.9632,  1.7093,  ...,  0.9068,  0.9065,  0.9059]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0034,  0.0038, -0.0043,  ..., -0.0071,  0.0030, -0.0153]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0034,  0.0038, -0.0043,  ..., -0.0071,  0.0030, -0.0153]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.2515,  3.4878,  1.0475,  ..., -3.0656, -3.0651, -3.0650]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0008, -0.0077,  0.0013,  ...,  0.0006, -0.0118, -0.0026]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0008, -0.0077,  0.0013,  ...,  0.0006, -0.0118, -0.0026]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.3882,  3.6918,  7.2423,  ..., -0.6630, -0.6625, -0.6620]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.4627,  1.4199,  2.8800,  ..., -3.5673, -3.5674, -3.5671]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0038, -0.0038, -0.0035,  ..., -0.0061,  0.0021,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0038, -0.0038, -0.0035,  ..., -0.0061,  0.0021,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 8.3884, -0.8113,  2.9059,  ..., -1.3511, -1.3511, -1.3513]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.0293, -5.4142, -3.2402,  ..., -0.7023, -0.7029, -0.7030]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0123, -0.0071, -0.0078,  ...,  0.0280,  0.0082,  0.0076]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0123, -0.0071, -0.0078,  ...,  0.0280,  0.0082,  0.0076]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.3591, -3.9261,  4.2697,  ...,  2.2377,  2.2366,  2.2365]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0013,  0.0209,  0.0107,  ...,  0.0001, -0.0162,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0013,  0.0209,  0.0107,  ...,  0.0001, -0.0162,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 5.3172,  7.5477,  2.0222,  ..., -0.1582, -0.1582, -0.1591]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.1456,  6.1462,  0.7585,  ..., -1.5087, -1.5084, -1.5093]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.4931, -4.4966, -0.7325,  ...,  2.1651,  2.1660,  2.1653]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0032, -0.0012, -0.0282,  ..., -0.0007,  0.0082, -0.0076]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0032, -0.0012, -0.0282,  ..., -0.0007,  0.0082, -0.0076]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.3028, 12.1057,  0.2260,  ..., -0.6559, -0.6560, -0.6563]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0035, -0.0209, -0.0248,  ..., -0.0223,  0.0069, -0.0127]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0035, -0.0209, -0.0248,  ..., -0.0223,  0.0069, -0.0127]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.3868, -1.7872,  1.9420,  ...,  1.5993,  1.6007,  1.6006]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0067,  0.0127,  0.0075,  ...,  0.0030, -0.0160, -0.0062]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0067,  0.0127,  0.0075,  ...,  0.0030, -0.0160, -0.0062]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.8082, -2.9429,  1.0673,  ..., -1.7794, -1.7791, -1.7795]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.7408, -4.7217, -1.8763,  ..., -1.5585, -1.5580, -1.5585]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0029, -0.0177, -0.0141,  ...,  0.0016, -0.0153,  0.0040]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0029, -0.0177, -0.0141,  ...,  0.0016, -0.0153,  0.0040]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.9456,  1.5991,  0.6526,  ..., -1.1952, -1.1943, -1.1957]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.9732,  3.8609, -0.1656,  ..., -1.6304, -1.6297, -1.6301]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0119, -0.0023, -0.0126,  ..., -0.0117, -0.0003,  0.0177]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0119, -0.0023, -0.0126,  ..., -0.0117, -0.0003,  0.0177]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.0709,  6.6360, -2.6514,  ..., -1.3113, -1.3105, -1.3115]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0036,  0.0223, -0.0112,  ...,  0.0092, -0.0005,  0.0037]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0036,  0.0223, -0.0112,  ...,  0.0092, -0.0005,  0.0037]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.2206,  2.4472, -1.9566,  ..., -1.2133, -1.2132, -1.2142]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-2.3007e-05,  5.9891e-04, -5.3787e-04,  ...,  4.5776e-04,\n",
      "           5.6839e-04, -2.7275e-04]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-2.3007e-05,  5.9891e-04, -5.3787e-04,  ...,  4.5776e-04,\n",
      "           5.6839e-04, -2.7275e-04]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.5176, -0.9849,  0.3258,  ..., -0.7949, -0.7943, -0.7946]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.5004,  2.6596, -1.1331,  ..., -1.6462, -1.6451, -1.6453]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0042, -0.0126,  0.0064,  ..., -0.0089,  0.0113, -0.0028]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0042, -0.0126,  0.0064,  ..., -0.0089,  0.0113, -0.0028]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.3725,  0.2594, -0.6799,  ..., -2.3497, -2.3498, -2.3498]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0071,  0.0157,  0.0038,  ...,  0.0063,  0.0005, -0.0036]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0071,  0.0157,  0.0038,  ...,  0.0063,  0.0005, -0.0036]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.4679,  1.8144, -1.9817,  ..., -1.5515, -1.5499, -1.5511]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0004, -0.0193, -0.0104,  ...,  0.0118,  0.0016,  0.0117]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0004, -0.0193, -0.0104,  ...,  0.0118,  0.0016,  0.0117]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.7208, -2.1970, -5.0860,  ..., -0.7863, -0.7854, -0.7858]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0027, -0.0110, -0.0009,  ..., -0.0010, -0.0021, -0.0044]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0027, -0.0110, -0.0009,  ..., -0.0010, -0.0021, -0.0044]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.8722,  3.4382, -2.5546,  ..., -1.7179, -1.7178, -1.7180]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0081,  0.0101,  0.0019,  ..., -0.0071, -0.0009,  0.0015]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0081,  0.0101,  0.0019,  ..., -0.0071, -0.0009,  0.0015]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[2.7599, 2.2939, 1.3147,  ..., 0.3591, 0.3597, 0.3598]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0170,  0.0295, -0.0204,  ..., -0.0018,  0.0102, -0.0142]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0170,  0.0295, -0.0204,  ..., -0.0018,  0.0102, -0.0142]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.2763,  1.7812,  1.3218,  ..., -1.4423, -1.4419, -1.4413]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0037,  0.0040,  0.0041,  ..., -0.0101,  0.0089,  0.0243]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0037,  0.0040,  0.0041,  ..., -0.0101,  0.0089,  0.0243]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.9206,  1.9246, -2.1905,  ..., -2.3396, -2.3392, -2.3392]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0054,  0.0061,  0.0085,  ..., -0.0032,  0.0020, -0.0164]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0054,  0.0061,  0.0085,  ..., -0.0032,  0.0020, -0.0164]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.5286, -5.5253, -0.2846,  ..., -0.6745, -0.6743, -0.6751]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0075, -0.0050,  0.0016,  ...,  0.0295, -0.0137,  0.0021]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0075, -0.0050,  0.0016,  ...,  0.0295, -0.0137,  0.0021]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.6344, -0.7373, -4.8042,  ..., -2.3493, -2.3480, -2.3489]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-2.3007e-05,  5.9891e-04, -5.3787e-04,  ...,  4.5776e-04,\n",
      "           5.6839e-04, -2.7275e-04]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-2.3007e-05,  5.9891e-04, -5.3787e-04,  ...,  4.5776e-04,\n",
      "           5.6839e-04, -2.7275e-04]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.0371, -0.1779, -3.9866,  ..., -1.1455, -1.1447, -1.1456]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0040,  0.0074, -0.0010,  ..., -0.0082, -0.0087,  0.0015]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0040,  0.0074, -0.0010,  ..., -0.0082, -0.0087,  0.0015]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.2521, -0.1333, -1.6064,  ..., -1.0620, -1.0613, -1.0620]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0071,  0.0157,  0.0038,  ...,  0.0063,  0.0005, -0.0036]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0071,  0.0157,  0.0038,  ...,  0.0063,  0.0005, -0.0036]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.2903,  0.4645, -3.0411,  ..., -2.6449, -2.6442, -2.6444]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0004, -0.0193, -0.0104,  ...,  0.0118,  0.0016,  0.0117]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0004, -0.0193, -0.0104,  ...,  0.0118,  0.0016,  0.0117]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-4.9022, -4.6103, -2.0388,  ..., -0.4406, -0.4408, -0.4403]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0101, -0.0058,  0.0087,  ...,  0.0025,  0.0063,  0.0003]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0101, -0.0058,  0.0087,  ...,  0.0025,  0.0063,  0.0003]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.6558,  1.2308,  0.7264,  ..., -1.8384, -1.8384, -1.8379]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0051, -0.0004, -0.0248,  ..., -0.0130,  0.0147,  0.0036]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0051, -0.0004, -0.0248,  ..., -0.0130,  0.0147,  0.0036]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.0661,  0.3916, -0.5686,  ..., -0.3410, -0.3407, -0.3404]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0037,  0.0040,  0.0041,  ..., -0.0101,  0.0089,  0.0243]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0037,  0.0040,  0.0041,  ..., -0.0101,  0.0089,  0.0243]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.9579, -0.6263, -4.4607,  ..., -1.1866, -1.1857, -1.1863]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 7.8201e-05, -9.9487e-03, -1.1368e-03,  ..., -4.2534e-04,\n",
      "          -8.6670e-03,  7.5531e-04]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 7.8201e-05, -9.9487e-03, -1.1368e-03,  ..., -4.2534e-04,\n",
      "          -8.6670e-03,  7.5531e-04]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.9053,  3.8153,  4.1856,  ..., -1.6572, -1.6563, -1.6560]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0186,  0.0043,  0.0227,  ...,  0.0088, -0.0049,  0.0025]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0186,  0.0043,  0.0227,  ...,  0.0088, -0.0049,  0.0025]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.4512, 10.2327, 10.2727,  ..., -1.4624, -1.4608, -1.4608]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0112, -0.0073,  0.0118,  ..., -0.0182,  0.0040,  0.0034]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0112, -0.0073,  0.0118,  ..., -0.0182,  0.0040,  0.0034]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.3579,  2.7078,  2.8498,  ..., -0.8039, -0.8028, -0.8030]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0195, -0.0029,  0.0133,  ..., -0.0145,  0.0056,  0.0117]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0195, -0.0029,  0.0133,  ..., -0.0145,  0.0056,  0.0117]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.4487, -1.7388,  0.6734,  ..., -2.5842, -2.5833, -2.5839]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0093, -0.0107,  0.0162,  ..., -0.0099,  0.0160,  0.0139]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0093, -0.0107,  0.0162,  ..., -0.0099,  0.0160,  0.0139]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.3108, -3.4314, -0.4240,  ..., -2.4828, -2.4824, -2.4833]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0012, -0.0006,  0.0006,  ..., -0.0002,  0.0013, -0.0006]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0012, -0.0006,  0.0006,  ..., -0.0002,  0.0013, -0.0006]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.7512,  2.8551,  5.2249,  ..., -4.5689, -4.5679, -4.5680]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0032, -0.0039, -0.0035,  ..., -0.0044,  0.0006,  0.0049]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0032, -0.0039, -0.0035,  ..., -0.0044,  0.0006,  0.0049]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.7771,  1.3997,  4.2881,  ..., -1.6369, -1.6365, -1.6367]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0015, -0.0030,  0.0022,  ...,  0.0002, -0.0110,  0.0003]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0015, -0.0030,  0.0022,  ...,  0.0002, -0.0110,  0.0003]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.3893, -3.3161, -0.4270,  ..., -0.5903, -0.5900, -0.5896]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 1.1396e-04, -1.0925e-02, -4.6692e-03,  ..., -4.0054e-05,\n",
      "           8.4839e-03, -1.3962e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 1.1396e-04, -1.0925e-02, -4.6692e-03,  ..., -4.0054e-05,\n",
      "           8.4839e-03, -1.3962e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.9743,  0.6660, -0.0290,  ..., -1.9516, -1.9513, -1.9520]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.0455, -2.0126,  0.0682,  ..., -1.5931, -1.5925, -1.5937]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0046,  0.0154, -0.0052,  ..., -0.0084, -0.0023,  0.0017]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0046,  0.0154, -0.0052,  ..., -0.0084, -0.0023,  0.0017]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.6990, -1.0435, -2.6874,  ..., -1.9581, -1.9576, -1.9586]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0050, -0.0138, -0.0035,  ...,  0.0022,  0.0170,  0.0015]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0050, -0.0138, -0.0035,  ...,  0.0022,  0.0170,  0.0015]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.4539, -0.1370, -2.6062,  ..., -2.5004, -2.4999, -2.5007]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0119, -0.0023, -0.0126,  ..., -0.0117, -0.0003,  0.0177]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0119, -0.0023, -0.0126,  ..., -0.0117, -0.0003,  0.0177]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.4726, -3.8662, -0.6862,  ..., -1.9741, -1.9731, -1.9732]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0063,  0.0005, -0.0061,  ...,  0.0058,  0.0203,  0.0034]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0063,  0.0005, -0.0061,  ...,  0.0058,  0.0203,  0.0034]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[0.5781, 2.9712, 2.6818,  ..., 1.1774, 1.1777, 1.1775]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.9239, -1.8706,  2.3316,  ..., -1.6806, -1.6799, -1.6804]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.4819, -2.3683, -2.9432,  ..., -3.1436, -3.1436, -3.1437]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0036,  0.0184,  0.0156,  ..., -0.0067,  0.0151,  0.0086]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0036,  0.0184,  0.0156,  ..., -0.0067,  0.0151,  0.0086]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-6.9702, -4.1871, -2.9567,  ...,  4.0093,  4.0097,  4.0094]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0016,  0.0124, -0.0045,  ..., -0.0177,  0.0157, -0.0016]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0016,  0.0124, -0.0045,  ..., -0.0177,  0.0157, -0.0016]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.4819, -0.1222,  0.9041,  ..., -2.2495, -2.2487, -2.2498]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0008, -0.0077,  0.0013,  ...,  0.0006, -0.0118, -0.0026]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0008, -0.0077,  0.0013,  ...,  0.0006, -0.0118, -0.0026]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.1710,  2.7974,  5.8687,  ..., -0.8092, -0.8088, -0.8088]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.8183,  1.2459,  2.3437,  ..., -3.6215, -3.6216, -3.6215]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0038, -0.0038, -0.0035,  ..., -0.0061,  0.0021,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0038, -0.0038, -0.0035,  ..., -0.0061,  0.0021,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.8976, -2.7700, -0.9937,  ..., -0.6450, -0.6448, -0.6452]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.0357, -0.7376, -0.6203,  ..., -1.3933, -1.3933, -1.3935]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0123, -0.0071, -0.0078,  ...,  0.0280,  0.0082,  0.0076]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0123, -0.0071, -0.0078,  ...,  0.0280,  0.0082,  0.0076]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.1083, -0.0326,  6.0396,  ...,  0.6390,  0.6386,  0.6379]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0013,  0.0209,  0.0107,  ...,  0.0001, -0.0162,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0013,  0.0209,  0.0107,  ...,  0.0001, -0.0162,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.9416,  9.7112, -0.1328,  ..., -0.5209, -0.5205, -0.5214]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.0259,  6.5028, -0.2255,  ..., -1.2999, -1.2999, -1.3005]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.6241, -2.6318,  0.6842,  ...,  2.0682,  2.0692,  2.0684]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0032, -0.0012, -0.0282,  ..., -0.0007,  0.0082, -0.0076]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0032, -0.0012, -0.0282,  ..., -0.0007,  0.0082, -0.0076]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.0535,  8.5590, -0.4892,  ..., -0.4010, -0.4010, -0.4017]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0009,  0.0052, -0.0075,  ..., -0.0007, -0.0165,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0009,  0.0052, -0.0075,  ..., -0.0007, -0.0165,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.8543,  3.9463, -3.3922,  ..., -1.5590, -1.5586, -1.5599]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.8115, -3.9955, -0.3136,  ...,  1.1152,  1.1144,  1.1138]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0187,  0.0025, -0.0248,  ...,  0.0033, -0.0015,  0.0059]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0187,  0.0025, -0.0248,  ...,  0.0033, -0.0015,  0.0059]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[2.3460, 5.7076, 1.1687,  ..., 0.6429, 0.6428, 0.6422]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.7647, 12.2807,  0.1629,  ...,  0.3709,  0.3711,  0.3702]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.2943, -4.5870,  1.4900,  ...,  0.8882,  0.8882,  0.8881]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0175, -0.0078,  0.0221,  ...,  0.0123,  0.0413,  0.0099]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0175, -0.0078,  0.0221,  ...,  0.0123,  0.0413,  0.0099]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.3831,  1.1685,  3.9708,  ..., -0.8285, -0.8273, -0.8282]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0138,  0.0064, -0.0227,  ...,  0.0118,  0.0052,  0.0091]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0138,  0.0064, -0.0227,  ...,  0.0118,  0.0052,  0.0091]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.8709,  4.5433,  0.6185,  ..., -1.9250, -1.9253, -1.9251]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0041, -0.0078, -0.0234,  ...,  0.0294, -0.0097, -0.0128]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0041, -0.0078, -0.0234,  ...,  0.0294, -0.0097, -0.0128]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.2090,  6.5949,  3.7240,  ..., -1.3317, -1.3317, -1.3310]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0076, -0.0053,  0.0005,  ..., -0.0067, -0.0006,  0.0015]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0076, -0.0053,  0.0005,  ..., -0.0067, -0.0006,  0.0015]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.9188,  8.6848,  2.9049,  ...,  0.2833,  0.2833,  0.2834]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0009,  0.0052, -0.0075,  ..., -0.0007, -0.0165,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0009,  0.0052, -0.0075,  ..., -0.0007, -0.0165,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.3274, 10.5578, -0.1576,  ..., -1.1242, -1.1239, -1.1246]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.1783, -4.0782,  0.1071,  ...,  1.6773,  1.6766,  1.6764]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0028,  0.0109,  0.0155,  ..., -0.0022, -0.0015,  0.0043]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0028,  0.0109,  0.0155,  ..., -0.0022, -0.0015,  0.0043]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[4.9678, 7.2635, 2.3780,  ..., 0.6923, 0.6922, 0.6921]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.8137,  4.4803,  1.8996,  ..., -0.8706, -0.8703, -0.8707]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.2562, -3.7675, -0.4150,  ...,  0.9767,  0.9770,  0.9767]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0022,  0.0466, -0.0109,  ...,  0.0036,  0.0116, -0.0139]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0022,  0.0466, -0.0109,  ...,  0.0036,  0.0116, -0.0139]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.3134,  0.6528,  0.7307,  ..., -1.6017, -1.6017, -1.6025]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0036,  0.0184,  0.0156,  ..., -0.0067,  0.0151,  0.0086]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0036,  0.0184,  0.0156,  ..., -0.0067,  0.0151,  0.0086]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.6074,  1.4797, -0.4538,  ..., -0.6367, -0.6364, -0.6372]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 3.2806e-03,  4.7445e-05, -6.9580e-03,  ..., -2.2736e-03,\n",
      "          -1.3428e-03,  6.6528e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 3.2806e-03,  4.7445e-05, -6.9580e-03,  ..., -2.2736e-03,\n",
      "          -1.3428e-03,  6.6528e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.7096, -0.8454, -3.1697,  ..., -0.4315, -0.4304, -0.4312]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0036, -0.0010, -0.0231,  ...,  0.0099,  0.0045, -0.0049]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0036, -0.0010, -0.0231,  ...,  0.0099,  0.0045, -0.0049]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.8151,  9.7638,  0.0560,  ..., -0.7380, -0.7373, -0.7385]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0035, -0.0209, -0.0248,  ..., -0.0223,  0.0069, -0.0127]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0035, -0.0209, -0.0248,  ..., -0.0223,  0.0069, -0.0127]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.1995,  1.4968,  4.4446,  ...,  0.1775,  0.1789,  0.1787]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0067,  0.0127,  0.0075,  ...,  0.0030, -0.0160, -0.0062]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0067,  0.0127,  0.0075,  ...,  0.0030, -0.0160, -0.0062]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 6.1362,  0.2864,  2.6334,  ..., -1.6718, -1.6712, -1.6719]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.6068, -5.5754, -1.2902,  ..., -1.7314, -1.7304, -1.7322]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0262, -0.0088,  0.0093,  ...,  0.0116,  0.0063, -0.0068]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0262, -0.0088,  0.0093,  ...,  0.0116,  0.0063, -0.0068]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.8275,  2.6545, -0.7779,  ..., -2.2056, -2.2054, -2.2069]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0036, -0.0010, -0.0231,  ...,  0.0099,  0.0045, -0.0049]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0036, -0.0010, -0.0231,  ...,  0.0099,  0.0045, -0.0049]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 5.7640,  5.2232,  4.5474,  ..., -1.8150, -1.8142, -1.8153]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0021, -0.0007,  0.0070,  ..., -0.0084, -0.0083, -0.0017]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0021, -0.0007,  0.0070,  ..., -0.0084, -0.0083, -0.0017]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.8111,  1.2571,  1.9526,  ..., -1.9218, -1.9215, -1.9224]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0029, -0.0006, -0.0219,  ..., -0.0032,  0.0011,  0.0075]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0029, -0.0006, -0.0219,  ..., -0.0032,  0.0011,  0.0075]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.3309, -2.7859, 13.8804,  ..., -1.3599, -1.3589, -1.3596]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0203,  0.0012,  0.0066,  ..., -0.0087,  0.0025,  0.0040]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0203,  0.0012,  0.0066,  ..., -0.0087,  0.0025,  0.0040]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.6407,  1.1192, -1.7604,  ..., -1.5145, -1.5141, -1.5145]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0071,  0.0157,  0.0038,  ...,  0.0063,  0.0005, -0.0036]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0071,  0.0157,  0.0038,  ...,  0.0063,  0.0005, -0.0036]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.3965,  0.5229, -0.6186,  ..., -2.0287, -2.0280, -2.0277]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0074,  0.0080,  0.0114,  ..., -0.0024, -0.0217,  0.0145]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0074,  0.0080,  0.0114,  ..., -0.0024, -0.0217,  0.0145]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.8172, -1.1813, -1.8141,  ..., -0.1950, -0.1939, -0.1943]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0033,  0.0122,  0.0004,  ..., -0.0025,  0.0162, -0.0006]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0033,  0.0122,  0.0004,  ..., -0.0025,  0.0162, -0.0006]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.3440,  1.0741,  1.3180,  ..., -2.1748, -2.1752, -2.1749]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.6637,  0.9051,  4.6422,  ..., -1.5390, -1.5384, -1.5388]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0046, -0.0117, -0.0055,  ..., -0.0015,  0.0047, -0.0038]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0046, -0.0117, -0.0055,  ..., -0.0015,  0.0047, -0.0038]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.4761, -0.3373, -0.9150,  ..., -2.6580, -2.6564, -2.6571]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0030,  0.0056,  0.0034,  ..., -0.0063, -0.0016,  0.0013]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0030,  0.0056,  0.0034,  ..., -0.0063, -0.0016,  0.0013]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.3808,  2.6872,  0.5701,  ..., -1.2466, -1.2455, -1.2461]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0036, -0.0010, -0.0231,  ...,  0.0099,  0.0045, -0.0049]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0036, -0.0010, -0.0231,  ...,  0.0099,  0.0045, -0.0049]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.2630,  1.2580, -2.2322,  ..., -2.3452, -2.3450, -2.3454]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0033, -0.0028,  0.0010,  ..., -0.0007, -0.0136,  0.0014]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0033, -0.0028,  0.0010,  ..., -0.0007, -0.0136,  0.0014]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.4661, -0.9020, -2.5675,  ...,  0.4146,  0.4149,  0.4144]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0010, -0.0075, -0.0001,  ...,  0.0001, -0.0011, -0.0019]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.8883, -0.2316, -0.5896,  ..., -1.3888, -1.3883, -1.3886]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0029, -0.0006, -0.0219,  ..., -0.0032,  0.0011,  0.0075]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0029, -0.0006, -0.0219,  ..., -0.0032,  0.0011,  0.0075]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.7277,  1.3783,  2.5376,  ..., -2.0539, -2.0537, -2.0535]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0034,  0.0038, -0.0043,  ..., -0.0071,  0.0030, -0.0153]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0034,  0.0038, -0.0043,  ..., -0.0071,  0.0030, -0.0153]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.8609, -0.3408,  1.0878,  ..., -1.5348, -1.5337, -1.5343]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 7.8201e-05, -9.9487e-03, -1.1368e-03,  ..., -4.2534e-04,\n",
      "          -8.6670e-03,  7.5531e-04]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 7.8201e-05, -9.9487e-03, -1.1368e-03,  ..., -4.2534e-04,\n",
      "          -8.6670e-03,  7.5531e-04]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.4972,  5.1805,  3.6910,  ..., -0.6999, -0.6987, -0.6979]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0186,  0.0043,  0.0227,  ...,  0.0088, -0.0049,  0.0025]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0186,  0.0043,  0.0227,  ...,  0.0088, -0.0049,  0.0025]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.8252,  9.5029, 10.3238,  ..., -0.6844, -0.6829, -0.6832]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0112, -0.0073,  0.0118,  ..., -0.0182,  0.0040,  0.0034]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0112, -0.0073,  0.0118,  ..., -0.0182,  0.0040,  0.0034]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.1049,  4.7016,  4.3922,  ..., -0.4317, -0.4307, -0.4303]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0195, -0.0029,  0.0133,  ..., -0.0145,  0.0056,  0.0117]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0195, -0.0029,  0.0133,  ..., -0.0145,  0.0056,  0.0117]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.5149,  2.9946,  4.3911,  ..., -3.0439, -3.0432, -3.0438]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0093, -0.0107,  0.0162,  ..., -0.0099,  0.0160,  0.0139]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0093, -0.0107,  0.0162,  ..., -0.0099,  0.0160,  0.0139]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.0974,  1.9277,  5.0944,  ..., -4.0628, -4.0621, -4.0628]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0012, -0.0006,  0.0006,  ..., -0.0002,  0.0013, -0.0006]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0012, -0.0006,  0.0006,  ..., -0.0002,  0.0013, -0.0006]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.5836,  3.3054,  1.9075,  ..., -3.2141, -3.2134, -3.2133]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0032, -0.0039, -0.0035,  ..., -0.0044,  0.0006,  0.0049]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0032, -0.0039, -0.0035,  ..., -0.0044,  0.0006,  0.0049]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 7.1533,  2.5894,  5.9803,  ..., -1.6255, -1.6251, -1.6254]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0015, -0.0030,  0.0022,  ...,  0.0002, -0.0110,  0.0003]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0015, -0.0030,  0.0022,  ...,  0.0002, -0.0110,  0.0003]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.1198, -0.2861,  3.2459,  ..., -2.2151, -2.2150, -2.2142]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0028,  0.0032, -0.0081,  ..., -0.0072, -0.0020, -0.0025]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0028,  0.0032, -0.0081,  ..., -0.0072, -0.0020, -0.0025]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.0943,  0.6041,  2.0801,  ..., -3.0354, -3.0350, -3.0357]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.1807, -3.9768, -0.3748,  ..., -0.9570, -0.9561, -0.9573]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0028,  0.0182, -0.0084,  ...,  0.0337, -0.0041,  0.0007]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0028,  0.0182, -0.0084,  ...,  0.0337, -0.0041,  0.0007]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.3068, -0.9960,  0.3759,  ..., -2.8905, -2.8900, -2.8905]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0119, -0.0023, -0.0126,  ..., -0.0117, -0.0003,  0.0177]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0119, -0.0023, -0.0126,  ..., -0.0117, -0.0003,  0.0177]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.5487, -5.7319, -2.3799,  ..., -2.3979, -2.3974, -2.3979]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0063,  0.0005, -0.0061,  ...,  0.0058,  0.0203,  0.0034]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0063,  0.0005, -0.0061,  ...,  0.0058,  0.0203,  0.0034]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.3998,  3.7878,  4.8889,  ..., -0.8259, -0.8251, -0.8254]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.0635,  0.1142,  3.3773,  ..., -1.8667, -1.8662, -1.8672]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.9541, -1.8064, -0.1402,  ..., -1.4772, -1.4775, -1.4780]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0036,  0.0184,  0.0156,  ..., -0.0067,  0.0151,  0.0086]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0036,  0.0184,  0.0156,  ..., -0.0067,  0.0151,  0.0086]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-7.6021, -4.5455, -2.9761,  ...,  5.3517,  5.3518,  5.3520]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0016,  0.0124, -0.0045,  ..., -0.0177,  0.0157, -0.0016]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0016,  0.0124, -0.0045,  ..., -0.0177,  0.0157, -0.0016]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.7094, -0.8698,  2.3904,  ..., -0.1038, -0.1039, -0.1040]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0032, -0.0005, -0.0005,  ..., -0.0008, -0.0034, -0.0003]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0032, -0.0005, -0.0005,  ..., -0.0008, -0.0034, -0.0003]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.6520, -4.8953, -0.8332,  ...,  0.9291,  0.9291,  0.9294]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0165,  0.0010, -0.0104,  ..., -0.0103, -0.0261, -0.0034]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0165,  0.0010, -0.0104,  ..., -0.0103, -0.0261, -0.0034]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.2453,  0.4375, -1.7249,  ..., -1.6248, -1.6238, -1.6252]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0028,  0.0227, -0.0025,  ..., -0.0023, -0.0020, -0.0031]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0028,  0.0227, -0.0025,  ..., -0.0023, -0.0020, -0.0031]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.4757, -0.7503, -1.9700,  ..., -2.0137, -2.0135, -2.0138]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0033,  0.0122,  0.0004,  ..., -0.0025,  0.0162, -0.0006]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0033,  0.0122,  0.0004,  ..., -0.0025,  0.0162, -0.0006]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.7818, -2.3262, -0.9862,  ..., -1.3396, -1.3397, -1.3396]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0033,  0.0215, -0.0002,  ..., -0.0026, -0.0072,  0.0089]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0033,  0.0215, -0.0002,  ..., -0.0026, -0.0072,  0.0089]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.7525, -1.5498, -2.6024,  ..., -1.0834, -1.0838, -1.0845]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0081,  0.0101,  0.0019,  ..., -0.0071, -0.0009,  0.0015]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0081,  0.0101,  0.0019,  ..., -0.0071, -0.0009,  0.0015]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.1721, -1.0723, -2.7348,  ...,  1.8423,  1.8424,  1.8424]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0082,  0.0045,  0.0023,  ...,  0.0170,  0.0035, -0.0029]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0082,  0.0045,  0.0023,  ...,  0.0170,  0.0035, -0.0029]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.6988, -1.1185,  0.1716,  ..., -2.3506, -2.3497, -2.3500]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0025,  0.0055,  0.0018,  ..., -0.0153,  0.0010,  0.0099]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0025,  0.0055,  0.0018,  ..., -0.0153,  0.0010,  0.0099]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.2097,  5.1007,  2.9693,  ..., -1.6334, -1.6341, -1.6340]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0008, -0.0077,  0.0013,  ...,  0.0006, -0.0118, -0.0026]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0008, -0.0077,  0.0013,  ...,  0.0006, -0.0118, -0.0026]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.8747,  4.9425,  4.2474,  ..., -1.9139, -1.9134, -1.9132]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0113,  0.0221, -0.0044,  ..., -0.0312, -0.0042, -0.0135]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.8550,  1.5938,  3.0359,  ..., -3.1838, -3.1840, -3.1834]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0038, -0.0038, -0.0035,  ..., -0.0061,  0.0021,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0038, -0.0038, -0.0035,  ..., -0.0061,  0.0021,  0.0010]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 6.0799, -2.6831, -0.4087,  ..., -1.6919, -1.6918, -1.6921]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0018,  0.0068,  0.0076,  ..., -0.0006, -0.0050, -0.0035]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.0556, -4.6657, -3.6295,  ..., -1.8747, -1.8748, -1.8752]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0123, -0.0071, -0.0078,  ...,  0.0280,  0.0082,  0.0076]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0123, -0.0071, -0.0078,  ...,  0.0280,  0.0082,  0.0076]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-1.4251, -1.8109,  4.1123,  ...,  1.4486,  1.4477,  1.4479]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0013,  0.0209,  0.0107,  ...,  0.0001, -0.0162,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0013,  0.0209,  0.0107,  ...,  0.0001, -0.0162,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.0810,  9.0378,  2.7377,  ..., -1.8266, -1.8266, -1.8270]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.6055,  7.1547,  9.3454,  ...,  0.8777,  0.8768,  0.8766]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.7072, -1.0667,  1.8805,  ...,  2.2950,  2.2960,  2.2954]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0198,  0.0253, -0.0208,  ...,  0.0056,  0.0219, -0.0152]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0198,  0.0253, -0.0208,  ...,  0.0056,  0.0219, -0.0152]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.5326, 11.3828,  4.2622,  ..., -1.1834, -1.1839, -1.1835]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0009,  0.0052, -0.0075,  ..., -0.0007, -0.0165,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0009,  0.0052, -0.0075,  ..., -0.0007, -0.0165,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.2768,  6.0751,  0.3901,  ..., -1.7435, -1.7434, -1.7445]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 1.0929, -4.3586,  1.4823,  ...,  1.7810,  1.7811,  1.7805]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0028,  0.0109,  0.0155,  ..., -0.0022, -0.0015,  0.0043]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0028,  0.0109,  0.0155,  ..., -0.0022, -0.0015,  0.0043]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[6.5788, 7.3057, 1.1791,  ..., 0.1406, 0.1402, 0.1396]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.2700,  8.8317, -0.9034,  ...,  1.1384,  1.1383,  1.1383]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.2098, -6.0259, -1.3421,  ..., -0.5316, -0.5305, -0.5313]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0195, -0.0029,  0.0133,  ..., -0.0145,  0.0056,  0.0117]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0195, -0.0029,  0.0133,  ..., -0.0145,  0.0056,  0.0117]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.7094,  9.8060, -0.5355,  ..., -2.3045, -2.3037, -2.3045]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 0.0009,  0.0052, -0.0075,  ..., -0.0007, -0.0165,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 0.0009,  0.0052, -0.0075,  ..., -0.0007, -0.0165,  0.0054]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.9747,  6.3509,  0.2488,  ..., -1.8615, -1.8607, -1.8613]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[ 6.5327e-05, -3.0060e-03, -7.2937e-03,  ...,  7.8735e-03,\n",
      "          -7.4208e-06,  2.8229e-03]]], device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.6106, -3.9756,  1.0150,  ...,  0.9849,  0.9844,  0.9840]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0187,  0.0025, -0.0248,  ...,  0.0033, -0.0015,  0.0059]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0187,  0.0025, -0.0248,  ...,  0.0033, -0.0015,  0.0059]]],\n",
      "       device='cuda:0')\n",
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[2.0735, 2.8638, 3.4287,  ..., 1.1830, 1.1828, 1.1821]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n",
      "inputs_embeds tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n",
      "inputs_embeds2 tensor(0, device='cuda:0') tensor([[[-0.0047, -0.0011,  0.0134,  ..., -0.0056, -0.0227, -0.0231]]],\n",
      "       device='cuda:0')\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/434 [00:11<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "outputs CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.4025,  6.4604,  6.7386,  ...,  1.0645,  1.0639,  1.0639]]],\n",
      "       device='cuda:0'), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f5c4c9385f0>, hidden_states=None, attentions=None)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Smoke test: run generation for the first batch only (note the `break`), then decode\n",
    "# the prompt, the newly generated tokens and the reference labels for inspection.\n",
    "for batch_idx, batch in enumerate(tqdm(dataloader)):\n",
    "    # `audio_path` is popped first so it is not part of the dict moved to the GPU below\n",
    "    # (presumably it is not a tensor -- confirm against the collator).\n",
    "    audio_path = batch.pop('audio_path')\n",
    "    # Drop entries that are None; identity check instead of comparing types.\n",
    "    batch = {k: v.to(\"cuda\") for k, v in batch.items() if v is not None}\n",
    "    with torch.inference_mode():\n",
    "        # NOTE(review): temperature=0.001 with do_sample=True is close to greedy decoding;\n",
    "        # consider do_sample=False if deterministic output is the goal.\n",
    "        generate_ids = model.generate(\n",
    "            **batch,\n",
    "            max_new_tokens=256,\n",
    "            temperature=0.001,\n",
    "            top_p=0.95,\n",
    "            top_k=64,\n",
    "            do_sample=True,\n",
    "        )\n",
    "\n",
    "        # Decoded prompt text.\n",
    "        batch_inp = processor.batch_decode(\n",
    "            batch['input_ids'], skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
    "        )\n",
    "        # Decode only the positions that come after the prompt length.\n",
    "        batch_predictions = processor.batch_decode(\n",
    "            generate_ids[:, batch['input_ids'].shape[1]:], skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
    "        )\n",
    "        # NOTE(review): assumes `labels` holds only valid token ids; an ignore index such as\n",
    "        # -100 would have to be masked before decoding -- confirm against the collator.\n",
    "        batch_references = processor.batch_decode(\n",
    "            batch['labels'], skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
    "        )\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gather the tensors of the audio projector's state dict into a list for inspection.\n",
    "a = [*model.audio_projector.state_dict().values()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([3072, 1024]), torch.Size([3072]))"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shapes of the first two tensors held in `a`.\n",
    "(a[0].shape, a[1].shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(tensor([[[-0.3780, -0.7710,  0.3672,  ..., -0.5870,  0.4069,  0.8486],\n",
       "          [ 0.1079,  0.1348, -0.6116,  ..., -2.2154,  0.5705,  0.8937],\n",
       "          [-1.0184,  1.8919,  0.3304,  ...,  0.3861,  0.6337, -0.4413],\n",
       "          ...,\n",
       "          [-0.0304,  0.0203, -0.0488,  ...,  0.0108, -0.0134,  0.0664],\n",
       "          [-1.0408,  0.2857,  0.1969,  ...,  0.0895, -1.0475,  0.4363],\n",
       "          [-1.8609, -0.4888, -0.2397,  ..., -1.5569, -1.0248, -0.4421]]],\n",
       "        device='cuda:0', grad_fn=<CheckpointFunctionBackward>),\n",
       " None)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run `model.audio_tower` on the current batch's `input_audio_embeds` to inspect its output.\n",
    "# The second positional argument is passed as None (presumably a mask -- confirm).\n",
    "# Gradients are disabled: this call is for inspection only, so no autograd graph is kept.\n",
    "with torch.no_grad():\n",
    "    a = model.audio_tower(batch['input_audio_embeds'], None)\n",
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/jeff/huggingface/modules/transformers_modules/gemma-3-4b-it-omni/speech_conformer_encoder.py:2775: FutureWarning: Please specify CheckpointImpl.NO_REENTRANT as CheckpointImpl.REENTRANT will soon be removed as the default and eventually deprecated.\n",
      "  lambda i: encoder_checkpoint_wrapper(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "######################## speech lora #############\n",
      "######################## text lora #############\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e466e7e5b7de464d83c1539f0cb8f93f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of Gemma3OmniForConditionalGeneration were not initialized from the model checkpoint at /mnt/jeff/InCar/Gemma3omni/gemma-3-4b-it-omni and are newly initialized: ['language_model.model.base_model.model.layers.0.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.0.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.0.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.0.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.0.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.0.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.0.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.0.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.0.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.0.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.0.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.0.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.0.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.0.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.1.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.1.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.1.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.1.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.1.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.1.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.1.self_attn.k_proj.lora_A.text.weight', 
'language_model.model.base_model.model.layers.1.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.1.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.1.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.1.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.1.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.1.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.1.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.10.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.10.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.10.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.10.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.10.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.10.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.10.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.10.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.10.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.10.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.10.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.10.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.10.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.10.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.11.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.11.mlp.down_proj.lora_B.text.weight', 
'language_model.model.base_model.model.layers.11.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.11.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.11.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.11.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.11.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.11.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.11.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.11.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.11.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.11.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.11.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.11.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.12.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.12.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.12.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.12.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.12.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.12.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.12.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.12.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.12.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.12.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.12.self_attn.q_proj.lora_A.text.weight', 
'language_model.model.base_model.model.layers.12.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.12.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.12.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.13.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.13.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.13.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.13.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.13.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.13.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.13.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.13.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.13.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.13.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.13.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.13.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.13.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.13.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.14.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.14.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.14.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.14.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.14.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.14.mlp.up_proj.lora_B.text.weight', 
'language_model.model.base_model.model.layers.14.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.14.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.14.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.14.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.14.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.14.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.14.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.14.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.15.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.15.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.15.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.15.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.15.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.15.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.15.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.15.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.15.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.15.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.15.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.15.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.15.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.15.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.16.mlp.down_proj.lora_A.text.weight', 
'language_model.model.base_model.model.layers.16.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.16.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.16.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.16.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.16.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.16.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.16.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.16.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.16.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.16.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.16.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.16.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.16.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.17.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.17.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.17.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.17.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.17.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.17.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.17.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.17.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.17.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.17.self_attn.o_proj.lora_B.text.weight', 
'language_model.model.base_model.model.layers.17.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.17.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.17.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.17.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.18.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.18.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.18.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.18.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.18.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.18.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.18.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.18.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.18.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.18.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.18.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.18.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.18.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.18.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.19.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.19.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.19.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.19.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.19.mlp.up_proj.lora_A.text.weight', 
'language_model.model.base_model.model.layers.19.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.19.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.19.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.19.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.19.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.19.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.19.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.19.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.19.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.2.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.2.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.2.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.2.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.2.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.2.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.2.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.2.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.2.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.2.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.2.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.2.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.2.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.2.self_attn.v_proj.lora_B.text.weight', 
'language_model.model.base_model.model.layers.20.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.20.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.20.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.20.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.20.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.20.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.20.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.20.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.20.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.20.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.20.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.20.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.20.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.20.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.21.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.21.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.21.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.21.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.21.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.21.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.21.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.21.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.21.self_attn.o_proj.lora_A.text.weight', 
'language_model.model.base_model.model.layers.21.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.21.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.21.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.21.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.21.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.22.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.22.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.22.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.22.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.22.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.22.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.22.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.22.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.22.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.22.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.22.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.22.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.22.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.22.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.23.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.23.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.23.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.23.mlp.gate_proj.lora_B.text.weight', 
'language_model.model.base_model.model.layers.23.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.23.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.23.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.23.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.23.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.23.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.23.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.23.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.23.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.23.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.24.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.24.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.24.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.24.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.24.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.24.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.24.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.24.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.24.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.24.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.24.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.24.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.24.self_attn.v_proj.lora_A.text.weight', 
'language_model.model.base_model.model.layers.24.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.25.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.25.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.25.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.25.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.25.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.25.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.25.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.25.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.25.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.25.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.25.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.25.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.25.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.25.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.26.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.26.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.26.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.26.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.26.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.26.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.26.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.26.self_attn.k_proj.lora_B.text.weight', 
'language_model.model.base_model.model.layers.26.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.26.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.26.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.26.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.26.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.26.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.27.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.27.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.27.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.27.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.27.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.27.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.27.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.27.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.27.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.27.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.27.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.27.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.27.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.27.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.28.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.28.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.28.mlp.gate_proj.lora_A.text.weight', 
'language_model.model.base_model.model.layers.28.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.28.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.28.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.28.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.28.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.28.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.28.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.28.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.28.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.28.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.28.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.29.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.29.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.29.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.29.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.29.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.29.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.29.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.29.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.29.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.29.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.29.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.29.self_attn.q_proj.lora_B.text.weight', 
'language_model.model.base_model.model.layers.29.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.29.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.3.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.3.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.3.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.3.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.3.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.3.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.3.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.3.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.3.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.3.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.3.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.3.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.3.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.3.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.30.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.30.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.30.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.30.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.30.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.30.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.30.self_attn.k_proj.lora_A.text.weight', 
'language_model.model.base_model.model.layers.30.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.30.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.30.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.30.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.30.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.30.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.30.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.31.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.31.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.31.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.31.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.31.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.31.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.31.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.31.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.31.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.31.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.31.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.31.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.31.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.31.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.32.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.32.mlp.down_proj.lora_B.text.weight', 
'language_model.model.base_model.model.layers.32.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.32.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.32.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.32.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.32.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.32.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.32.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.32.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.32.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.32.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.32.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.32.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.33.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.33.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.33.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.33.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.33.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.33.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.33.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.33.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.33.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.33.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.33.self_attn.q_proj.lora_A.text.weight', 
'language_model.model.base_model.model.layers.33.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.33.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.33.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.4.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.4.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.4.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.4.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.4.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.4.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.4.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.4.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.4.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.4.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.4.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.4.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.4.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.4.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.5.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.5.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.5.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.5.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.5.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.5.mlp.up_proj.lora_B.text.weight', 
'language_model.model.base_model.model.layers.5.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.5.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.5.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.5.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.5.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.5.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.5.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.5.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.6.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.6.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.6.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.6.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.6.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.6.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.6.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.6.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.6.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.6.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.6.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.6.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.6.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.6.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.7.mlp.down_proj.lora_A.text.weight', 
'language_model.model.base_model.model.layers.7.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.7.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.7.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.7.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.7.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.7.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.7.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.7.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.7.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.7.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.7.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.7.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.7.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.8.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.8.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.8.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.8.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.8.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.8.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.8.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.8.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.8.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.8.self_attn.o_proj.lora_B.text.weight', 
'language_model.model.base_model.model.layers.8.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.8.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.8.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.8.self_attn.v_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.9.mlp.down_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.9.mlp.down_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.9.mlp.gate_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.9.mlp.gate_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.9.mlp.up_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.9.mlp.up_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.9.self_attn.k_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.9.self_attn.k_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.9.self_attn.o_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.9.self_attn.o_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.9.self_attn.q_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.9.self_attn.q_proj.lora_B.text.weight', 'language_model.model.base_model.model.layers.9.self_attn.v_proj.lora_A.text.weight', 'language_model.model.base_model.model.layers.9.self_attn.v_proj.lora_B.text.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "# Checkpoint directory and revision to load (alternative revision kept for reference: \"v1.0\").\n",
    "model_id = \"/mnt/jeff/InCar/Gemma3omni/gemma-3-4b-it-omni\"\n",
    "revision = \"main\"\n",
    "\n",
    "# Load the checkpoint onto the GPU.\n",
    "# trust_remote_code=True executes the modeling code shipped with the checkpoint,\n",
    "# so only point model_id at a source you trust.\n",
    "# NOTE(review): the saved output of this cell lists many lora_A/lora_B \"text\" weights\n",
    "# followed by a \"You should probably TRAIN this model\" warning - confirm that the\n",
    "# adapter weights are really being restored from the checkpoint.\n",
    "model2 = AutoModel.from_pretrained(\n",
    "    model_id,\n",
    "    revision=revision,\n",
    "    device_map=\"cuda\",\n",
    "    trust_remote_code=True,\n",
    ")\n",
    "\n",
    "# Switch to inference mode; eval() returns the same module object.\n",
    "model2 = model2.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([2560, 1024]), torch.Size([2560]))"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Gather the audio projector's state_dict tensors into a list, in state_dict order.\n",
    "b = [*model2.audio_projector.state_dict().values()]\n",
    "\n",
    "# Show the shapes of the first two entries as a tuple.\n",
    "(b[0].shape, b[1].shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[tensor([[ 0.0080, -0.0159, -0.0219,  ...,  0.0051, -0.0021, -0.0060],\n",
       "         [-0.0007, -0.0223, -0.0045,  ..., -0.0062,  0.0105,  0.0078],\n",
       "         [-0.0025, -0.0088,  0.0214,  ...,  0.0082,  0.0203, -0.0014],\n",
       "         ...,\n",
       "         [-0.0037,  0.0070,  0.0036,  ...,  0.0029,  0.0449, -0.0284],\n",
       "         [-0.0010, -0.0221, -0.0238,  ..., -0.0029, -0.0071,  0.0167],\n",
       "         [-0.0194, -0.0090, -0.0015,  ...,  0.0216, -0.0037,  0.0173]],\n",
       "        device='cuda:0'),\n",
       " tensor([0.0046, 0.0012, 0.0061,  ..., 0.0063, 0.0012, 0.0006], device='cuda:0'),\n",
       " tensor([[ 0.0442, -0.0010, -0.0430,  ...,  0.0108, -0.0195,  0.0104],\n",
       "         [ 0.0238,  0.0267, -0.0386,  ...,  0.0168, -0.0085, -0.0162],\n",
       "         [ 0.0073,  0.0107, -0.0245,  ...,  0.0154, -0.0144, -0.0066],\n",
       "         ...,\n",
       "         [ 0.0051, -0.0020,  0.0070,  ...,  0.0317,  0.0090,  0.0157],\n",
       "         [ 0.0171,  0.0206, -0.0173,  ...,  0.0034, -0.0194, -0.0054],\n",
       "         [ 0.0165, -0.0284,  0.0238,  ...,  0.0312, -0.0391,  0.0175]],\n",
       "        device='cuda:0'),\n",
       " tensor([-0.0039,  0.0003, -0.0019,  ..., -0.0058,  0.0032, -0.0073],\n",
       "        device='cuda:0')]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display every tensor collected from the audio projector's state_dict.\n",
    "b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}