isaacdonis committed on
Commit
7b6d011
·
verified ·
1 Parent(s): e222fa2

Style transfer model trained on 225 batch letters - SacreBLEU score of 82.45

Browse files
added_tokens.json ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<mask>": 256203,
3
+ "ace_Arab": 256001,
4
+ "ace_Latn": 256002,
5
+ "acm_Arab": 256003,
6
+ "acq_Arab": 256004,
7
+ "aeb_Arab": 256005,
8
+ "afr_Latn": 256006,
9
+ "ajp_Arab": 256007,
10
+ "aka_Latn": 256008,
11
+ "als_Latn": 256162,
12
+ "amh_Ethi": 256009,
13
+ "and_Cas": 256204,
14
+ "apc_Arab": 256010,
15
+ "arb_Arab": 256011,
16
+ "ars_Arab": 256012,
17
+ "ary_Arab": 256013,
18
+ "arz_Arab": 256014,
19
+ "asm_Beng": 256015,
20
+ "ast_Latn": 256016,
21
+ "awa_Deva": 256017,
22
+ "ayr_Latn": 256018,
23
+ "azb_Arab": 256019,
24
+ "azj_Latn": 256020,
25
+ "bak_Cyrl": 256021,
26
+ "bam_Latn": 256022,
27
+ "ban_Latn": 256023,
28
+ "bel_Cyrl": 256024,
29
+ "bem_Latn": 256025,
30
+ "ben_Beng": 256026,
31
+ "bho_Deva": 256027,
32
+ "bjn_Arab": 256028,
33
+ "bjn_Latn": 256029,
34
+ "bod_Tibt": 256030,
35
+ "bos_Latn": 256031,
36
+ "bug_Latn": 256032,
37
+ "bul_Cyrl": 256033,
38
+ "cat_Latn": 256034,
39
+ "ceb_Latn": 256035,
40
+ "ces_Latn": 256036,
41
+ "cjk_Latn": 256037,
42
+ "ckb_Arab": 256038,
43
+ "crh_Latn": 256039,
44
+ "cym_Latn": 256040,
45
+ "dan_Latn": 256041,
46
+ "deu_Latn": 256042,
47
+ "dik_Latn": 256043,
48
+ "dyu_Latn": 256044,
49
+ "dzo_Tibt": 256045,
50
+ "ell_Grek": 256046,
51
+ "eng_Latn": 256047,
52
+ "epo_Latn": 256048,
53
+ "est_Latn": 256049,
54
+ "eus_Latn": 256050,
55
+ "ewe_Latn": 256051,
56
+ "fao_Latn": 256052,
57
+ "fij_Latn": 256054,
58
+ "fin_Latn": 256055,
59
+ "fon_Latn": 256056,
60
+ "fra_Latn": 256057,
61
+ "fur_Latn": 256058,
62
+ "fuv_Latn": 256059,
63
+ "gaz_Latn": 256135,
64
+ "gla_Latn": 256060,
65
+ "gle_Latn": 256061,
66
+ "glg_Latn": 256062,
67
+ "grn_Latn": 256063,
68
+ "guj_Gujr": 256064,
69
+ "hat_Latn": 256065,
70
+ "hau_Latn": 256066,
71
+ "heb_Hebr": 256067,
72
+ "hin_Deva": 256068,
73
+ "hne_Deva": 256069,
74
+ "hrv_Latn": 256070,
75
+ "hun_Latn": 256071,
76
+ "hye_Armn": 256072,
77
+ "ibo_Latn": 256073,
78
+ "ilo_Latn": 256074,
79
+ "ind_Latn": 256075,
80
+ "isl_Latn": 256076,
81
+ "ita_Latn": 256077,
82
+ "jav_Latn": 256078,
83
+ "jpn_Jpan": 256079,
84
+ "kab_Latn": 256080,
85
+ "kac_Latn": 256081,
86
+ "kam_Latn": 256082,
87
+ "kan_Knda": 256083,
88
+ "kas_Arab": 256084,
89
+ "kas_Deva": 256085,
90
+ "kat_Geor": 256086,
91
+ "kaz_Cyrl": 256089,
92
+ "kbp_Latn": 256090,
93
+ "kea_Latn": 256091,
94
+ "khk_Cyrl": 256122,
95
+ "khm_Khmr": 256092,
96
+ "kik_Latn": 256093,
97
+ "kin_Latn": 256094,
98
+ "kir_Cyrl": 256095,
99
+ "kmb_Latn": 256096,
100
+ "kmr_Latn": 256099,
101
+ "knc_Arab": 256087,
102
+ "knc_Latn": 256088,
103
+ "kon_Latn": 256097,
104
+ "kor_Hang": 256098,
105
+ "lao_Laoo": 256100,
106
+ "lij_Latn": 256102,
107
+ "lim_Latn": 256103,
108
+ "lin_Latn": 256104,
109
+ "lit_Latn": 256105,
110
+ "lmo_Latn": 256106,
111
+ "ltg_Latn": 256107,
112
+ "ltz_Latn": 256108,
113
+ "lua_Latn": 256109,
114
+ "lug_Latn": 256110,
115
+ "luo_Latn": 256111,
116
+ "lus_Latn": 256112,
117
+ "lvs_Latn": 256101,
118
+ "mag_Deva": 256113,
119
+ "mai_Deva": 256114,
120
+ "mal_Mlym": 256115,
121
+ "mar_Deva": 256116,
122
+ "min_Latn": 256117,
123
+ "mkd_Cyrl": 256118,
124
+ "mlt_Latn": 256120,
125
+ "mni_Beng": 256121,
126
+ "mos_Latn": 256123,
127
+ "mri_Latn": 256124,
128
+ "mya_Mymr": 256126,
129
+ "nld_Latn": 256127,
130
+ "nno_Latn": 256128,
131
+ "nob_Latn": 256129,
132
+ "npi_Deva": 256130,
133
+ "nso_Latn": 256131,
134
+ "nus_Latn": 256132,
135
+ "nya_Latn": 256133,
136
+ "oci_Latn": 256134,
137
+ "ory_Orya": 256136,
138
+ "pag_Latn": 256137,
139
+ "pan_Guru": 256138,
140
+ "pap_Latn": 256139,
141
+ "pbt_Arab": 256143,
142
+ "pes_Arab": 256053,
143
+ "plt_Latn": 256119,
144
+ "pol_Latn": 256140,
145
+ "por_Latn": 256141,
146
+ "prs_Arab": 256142,
147
+ "quy_Latn": 256144,
148
+ "ron_Latn": 256145,
149
+ "run_Latn": 256146,
150
+ "rus_Cyrl": 256147,
151
+ "sag_Latn": 256148,
152
+ "san_Deva": 256149,
153
+ "sat_Beng": 256150,
154
+ "scn_Latn": 256151,
155
+ "shn_Mymr": 256152,
156
+ "sin_Sinh": 256153,
157
+ "slk_Latn": 256154,
158
+ "slv_Latn": 256155,
159
+ "smo_Latn": 256156,
160
+ "sna_Latn": 256157,
161
+ "snd_Arab": 256158,
162
+ "som_Latn": 256159,
163
+ "sot_Latn": 256160,
164
+ "spa_Latn": 256161,
165
+ "srd_Latn": 256163,
166
+ "srp_Cyrl": 256164,
167
+ "ssw_Latn": 256165,
168
+ "sun_Latn": 256166,
169
+ "swe_Latn": 256167,
170
+ "swh_Latn": 256168,
171
+ "szl_Latn": 256169,
172
+ "tam_Taml": 256170,
173
+ "taq_Latn": 256177,
174
+ "taq_Tfng": 256178,
175
+ "tat_Cyrl": 256171,
176
+ "tel_Telu": 256172,
177
+ "tgk_Cyrl": 256173,
178
+ "tgl_Latn": 256174,
179
+ "tha_Thai": 256175,
180
+ "tir_Ethi": 256176,
181
+ "tpi_Latn": 256179,
182
+ "tsn_Latn": 256180,
183
+ "tso_Latn": 256181,
184
+ "tuk_Latn": 256182,
185
+ "tum_Latn": 256183,
186
+ "tur_Latn": 256184,
187
+ "twi_Latn": 256185,
188
+ "tzm_Tfng": 256186,
189
+ "uig_Arab": 256187,
190
+ "ukr_Cyrl": 256188,
191
+ "umb_Latn": 256189,
192
+ "urd_Arab": 256190,
193
+ "uzn_Latn": 256191,
194
+ "vec_Latn": 256192,
195
+ "vie_Latn": 256193,
196
+ "war_Latn": 256194,
197
+ "wol_Latn": 256195,
198
+ "xho_Latn": 256196,
199
+ "ydd_Hebr": 256197,
200
+ "yor_Latn": 256198,
201
+ "yue_Hant": 256199,
202
+ "zho_Hans": 256200,
203
+ "zho_Hant": 256201,
204
+ "zsm_Latn": 256125,
205
+ "zul_Latn": 256202
206
+ }
config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/nllb-200-distilled-600M",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "relu",
5
+ "architectures": [
6
+ "M2M100ForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 0,
10
+ "d_model": 1024,
11
+ "decoder_attention_heads": 16,
12
+ "decoder_ffn_dim": 4096,
13
+ "decoder_layerdrop": 0,
14
+ "decoder_layers": 12,
15
+ "decoder_start_token_id": 2,
16
+ "dropout": 0.1,
17
+ "encoder_attention_heads": 16,
18
+ "encoder_ffn_dim": 4096,
19
+ "encoder_layerdrop": 0,
20
+ "encoder_layers": 12,
21
+ "eos_token_id": 2,
22
+ "init_std": 0.02,
23
+ "is_encoder_decoder": true,
24
+ "max_length": null,
25
+ "max_position_embeddings": 1024,
26
+ "model_type": "m2m_100",
27
+ "num_hidden_layers": 12,
28
+ "pad_token_id": 1,
29
+ "scale_embedding": true,
30
+ "tokenizer_class": "NllbTokenizer",
31
+ "torch_dtype": "float32",
32
+ "transformers_version": "4.49.0",
33
+ "use_cache": true,
34
+ "vocab_size": 256205
35
+ }
evaluation_new_model_output_text.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ I'm thrilled to announce the NeurIPS Data-Centric AI Workshop, which will take place on December 14, 2021.<<<DELIMITER>>>I wrote about data-centric AI, the way we systematically engineer the data to inform machine learning algorithms.<<<DELIMITER>>>This workshop will give you a chance to dive deeper into the subject. I intended to hold this forum as a forum for experts in the field to share ideas and ideas.<<<DELIMITER>>>This technology eventually becomes widely adopted and is used at AI conferences. Even as deep learning was a topic of conversation at NeurIPS, my collaborators and I organized workshops that encouraged discussion and progress. While data-centric AI is making a difference in practical applications, much more research remains to be done.<<<DELIMITER>>>One common misconception is that data-centric AI simply means that the data you use to train algorithms on is engineered.<<<DELIMITER>>>But at the same time, we need broad principles, rigorous procedures, and practical tools that will enable us to apply this principle consistently and systematically.<<<DELIMITER>>>The tools TensorFlow and PyTorch use have made it much easier to design neural network architectures without causing errors, and tools that can make data engineering more efficient. As part of a larger effort to build an MLOps platform for computer vision, my team at Landing AI (where jobs are available today) is working on data-centric algorithms for image vision.<<<DELIMITER>>>Welcome to lots of new groups dedicated to data-centric algorithmic breakthroughs. . Here are some ideas that I would like to dive into:. Identifying algorithms (or tools) that can improve the quality of data sourcing. Developing ways to detect and address inconsistencies in labeled data. Develop principles of design that help systematically improve data quality. Develop tools that help practitioners optimize error analysis. 
Explore how data engineering can contribute to AI use, for example, in developing models that are trained to be fairer and less biased. The deadline for<<<DELIMITER>>>You can learn more about it here. Thank you to the co-organizers Lora Aroyo, Cody Coleman, Greg Diamos, Vijay Janapa Reddi, Joaquin Vanschoren, and Sharon Zhou.<<<DELIMITER>>>I attended the Computer Vision and Pattern Recognition (CVPR) conference in Vancouver, Canada, from Sunday through Tuesday, where around 4,000 delegates attended.<<<DELIMITER>>>It feels great to see this new wave of face-to-face conferences with the pandemic lifting of restrictions. Computer vision is taking off.<<<DELIMITER>>>The natural language processing community had been talking about transformers for years, but ChatGPT didn't gain widespread traction.<<<DELIMITER>>>I saw in CVPR a lot of people excited about computer vision.<<<DELIMITER>>>So far, so many papers can be summarized in minutes, but among the milestones: Vision transformers are gaining traction.What's new: My blog post went so far as to call this a topic and it's still going strong.<<<DELIMITER>>>ViT debuted in a paper in 2020, and since then it has established itself as a viable alternative to convolutional neural networks.<<<DELIMITER>>>However, there are other challenges that need to be addressed.<<<DELIMITER>>>For example, turning a visual data into a sequence of tokens is a very simple matter, but turning an image segmentation, filtering, or object detection task into a token-based prediction involves many tasks.<<<DELIMITER>>>Many researchers are working on other techniques as well. 
Image generation: Algorithms that generate images have come to dominate CVPR's computer vision scene since the invention of GANs and diffusion models.<<<DELIMITER>>>This year, I've seen exciting progress on generating creative edits for images and getting more accurate control over the output of models.<<<DELIMITER>>>I've noticed a lot of attention focused on developing realistic faces which is understandable given the fascination that humans have with face combinations, and that NeRF-based approaches to generating 3D scenes from 2D images have gained traction over the past year or so, as we discussed in an earlier issue of The Batch.<<<DELIMITER>>>But I was surprised by the volume of research on NeRF that was presented.<<<DELIMITER>>>NeRF can scale up to larger environments, become more efficient, and incorporate dynamic scenes. How to use fewer input images while still accuracy. What's new: The golden age of multimodal models started.<<<DELIMITER>>>In contrast, transformers can take many forms of input, from images to text, and convert those dissimilar sources into a sequence of tokens that feed all transformers simultaneously.<<<DELIMITER>>>Many teams are working on this architecture as well. Even though it has taken longer than some to achieve self-driving cars, a lot of research is still in progress on this topic.<<<DELIMITER>>>I believe that large, pretrained transformers will drive tremendous progress in self-driving cars. . I spoke at CVPR's Computer Vision in the Wild workshop, where I chatted with the community about Landing AI's pioneering approach to simplifying computer vision using visual prompting.<<<DELIMITER>>>Thanks to Jianwei Yang, Jianfeng Gao, and the other organizers for inviting us to speak! Afterward, I met many people who work in computer vision and realized that there is a huge gap between the problems and benchmarks studied in academic research and those encountered in commercial applications.<<<DELIMITER>>>For example:. 
Data validation processes take precedence in theory studies over practical applications. I'll talk about this in a future letter.<<<DELIMITER>>>On the LMSYS Chatbot Arena Leaderboard, where anonymous chatbots compete against each other and ask users to judge which is more accurate, Google's Bard (Gemini Pro) climbed to No. 3, filling the gap quickly with OpenAI's GPT-4, currently the top seed.<<<DELIMITER>>>Currently, the open source Mixtral-8x7b-Instruct is the 11th largest open-source model of its kind, behind GPT-3.5-Turbo.<<<DELIMITER>>>Many smaller, but capable teams seem to have the technical skills to build foundation models.<<<DELIMITER>>>I believe that in 2024 many new teams will be coming into the arena with their own unique solutions. The barriers to building large language models are fading as deep expertise in training such systems becomes more widespread.<<<DELIMITER>>>Large language models (LLMs) are making significant progress toward becoming ubiquitous.<<<DELIMITER>>>There are many potential consequences of commoditization: Who gets it and who loses it? Meta, which prioritizes open source, is helping the industry.<<<DELIMITER>>>Unlike many of its competitors, it generates revenue primarily through ads on its platform. Neither sells LLM API subscriptions.<<<DELIMITER>>>Meta's use of iOS and Android recently left the company at risk of further privacy restrictions by Apple and Google, which in turn limits Meta's ability to deliver targeted ads. 
Thus, there's a lot of incentive for Meta to push for more open platforms that offer greater flexibility and aren't dominated by any single company.<<<DELIMITER>>>The strategic release of Llama as open source is in keeping with the company's business goals, and they've steadfastly defended PyTorch against Google's prominent rival.<<<DELIMITER>>>The resulting open source offerings are beneficial to the broader AI community and to the dissemination of knowledge, while companies like Google Cloud and Microsoft Azure could be more valuable if they were able to provide dominant, closed source LLMs that fit tightly into their cloud services.<<<DELIMITER>>>This can help further grow their cloud business.<<<DELIMITER>>>The big players in the cloud, including Google Cloud, Azure, and Amazon Web Services, are in a good position to build thriving businesses by integrating LLM API requests into their cloud services.<<<DELIMITER>>>But their cloud services will perform well enough even if they don't build a Gemini, GPT-4, or later version of a large language model that delivers a clear edge..<<<DELIMITER>>>If LLMs become increasingly marketed, they should be able to incorporate any new LLMs they find in their API portfolio. . The fact that LLMs are open or closed source opens companies to a variety of opportunities to integrate them into their existing product lines.<<<DELIMITER>>>For example:. 
Microsoft is using a sales team to market its software to corporate customers.<<<DELIMITER>>>The sales representatives are a powerful force in promoting Copilot, which is integrated with office productivity software.<<<DELIMITER>>>In contrast, Google's core business may be in jeopardy, as some users view querying an LLM as a complement to, say, web search.<<<DELIMITER>>>Despite its recent adoption, the technology is gaining traction.<<<DELIMITER>>>Meta will be tremendously beneficial as LLMs become widely available..<<<DELIMITER>>>Even today, LLMs are helping to generate more click-throughs in online advertising, and tech giants can afford to invest hundreds of millions, even billions of dollars, in developing an LLM even if it soon turns into a commodity.<<<DELIMITER>>>No doubt they'll have a harder time navigating the hardship of spending heavily on projects without success.<<<DELIMITER>>>But other bright startups will have their fair share of revenue, and competition among companies that offer large language models will drive innovation, which will benefit developers who build applications using these technologies. With huge investments coming in from established companies and startups, I believe that many exciting businesses can build applications on top of LLMs.<<<DELIMITER>>>Two years ago, I switched from a good old phone to a new one.How can I take long-term photos of Nova while she is sleeping and the room is dimly lit? I was still learning how to use the camera's current capabilities.<<<DELIMITER>>>This technology gave me thrill and excitement. In an earlier letter on ethics in AI development, I wrote about how to frame ethical principles into concrete, practical principles.<<<DELIMITER>>>Desencluttering expert Marie Kondo recommends a simple yet effective way to organize households.What she calls joy or joy should be removed from your living room.<<<DELIMITER>>>When we build AI products, how can we help people? 
The answer is not obvious.<<<DELIMITER>>>I find great pleasure in working on hard tasks, helping others, creating better society, and learning new things.<<<DELIMITER>>>I don't like digital products that are designed to addict people.<<<DELIMITER>>>People may have different values, but following a guiding principle can make the decisions easier:. Do machine learning projects please others? (This is not an exhaustive question to ask).<<<DELIMITER>>>Recent advances in machine learning have led to significant improvements in supervised learning algorithms that consistently deliver high-quality performance on standardized test data.<<<DELIMITER>>>Getting high accuracy on your test set is important, but it's not the only factor that matters.<<<DELIMITER>>>In our discussions, what do you think teammates with different backgrounds have in common? Machine learning engineers may tell you how well your models perform on test sets, and product managers may tell you what practical applications this will lead to? A simple But may suffice as a response from the engineer, pointing out that the engineer may have different perspectives.<<<DELIMITER>>>It performed well on the test set, but other factors are also important to consider:. 
Robustness and generalization: In practical deployment scenarios, performance degrades either by concept shift or by data shift (which occurs when the relationship between input (x) and output (y) changes, such as in regression work like predicting housing prices; rising inflation can lead to rising prices; and rising data shift is caused by changes in the distribution of input data.<<<DELIMITER>>>This type of data drift is related to models' ability to generalize correctly to classes that are few or missing in training.<<<DELIMITER>>>For example, an ASR might achieve high accuracy if, despite having different accents in its training and evaluation data, it can't understand UK speakers.<<<DELIMITER>>>If the product is successful in the U.K.<<<DELIMITER>>>And as the number of English speakers increases, the accuracy will diminish.<<<DELIMITER>>>A robust system outperforms a non-test. Even if the test accuracy is high, if a system lacks examples, it may have implications for approval or adoption.<<<DELIMITER>>>For example, a user might tolerate falsehoods when a search engine fails to deliver the best possible result for informational or transactional searches such as apple pie recipe or wireless data plan.. 
But if they search for Stanford,YouTube, or Reddit, they expect the right returns and will lose trust if they aren't given the right URL.<<<DELIMITER>>>So, while weighting test examples based on their importance can reduce the pressure, it's less likely to work in practice.<<<DELIMITER>>>Then the system's performance must be able to predict with confidence whether the intended borrower will pay back the loan and whether the lender will approve the loan application.<<<DELIMITER>>>While the accuracy may be high, a bias in performance for applications submitted by a particular minority group should be a wake-up call that we should be extra wary of widespread deployment.<<<DELIMITER>>>But the need to remain neutral is much bigger than that.<<<DELIMITER>>>For example:. If an ecommerce site recommends a product, how should it do so? It should prompt recommendations from a large number of vendors (and smaller ones as well) so that users have a more diverse and inclusive experience.<<<DELIMITER>>>In this case, poor performance in a particular subset such as ethnic groups or classes of sellers makes an otherwise accurate model unsuitable for a high success rate.<<<DELIMITER>>>I would like to see more product managers and engineers doing their best to understand how their AI system works. . 
When a product manager says the system isn't working on a particular application, I think it's important for my team to understand that our goal shouldn't be just to get a high average test accuracy, but rather to make sure the system actually addresses that problem.<<<DELIMITER>>>This will likely take visualizations, more data, algorithmic improvements, performance assessments, and even more complex deployment scenarios, all of which will involve human attention.<<<DELIMITER>>>Two pictures of the same equipment, taken in two different environments.<<<DELIMITER>>>Computationally and perceptually, the image at the right is more likely to spot a defect in the gear because of its training configuration. This case shows the value of a data-centric AI approach.<<<DELIMITER>>>To make a neural network more efficient, the data you're processing improves in a fraction of the effort you'd do to rearrange the network.<<<DELIMITER>>>Often, you might tweak the imaging configuration and compare in-person observations with those captured.If you can visually spot a defect in a photo taken from that angle but don't recognize it clearly, you might want to upgrade your imaging configuration.<<<DELIMITER>>>The options include lighting (lighting/camera/lighting/lighting/light/light/light angle/light/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image/image<<<DELIMITER>>>A defective item, such as a drinking glass or touchscreen, may be visible under certain lights but not others.<<<DELIMITER>>>A camera's performance is compromised when it is subjected to motion or vibration, which results in inconsistent images. . 
A camera's need for a certain resolution (that is, enough pixels covering a given area to capture the features it was intended to capture) or for focus, contrast, and time play a large role in revealing or hiding key features.<<<DELIMITER>>>Are your sights and objectives clearly visible? Have you implemented contrast and visual representations that make them easy to spot? Deep learning has been useful for situations with diverse data types, such as crowded public spaces or open spaces (say, a concert hall versus a campsite), but smaller datasets make computer vision easier.<<<DELIMITER>>>For example, if we try to use visual evidence to identify diseased plants, then deep learning will work best if we have large amounts of picture data showing plants photographed at all angles and under all kinds of lights.<<<DELIMITER>>>If all the pictures are uniformly shot, with normal lighting, image processing can be made simpler.<<<DELIMITER>>>In practice, this means the model will be more accurate and/or work well on far fewer data.<<<DELIMITER>>>With a consistent dataset, I've found that a neural network can be trained to perform valuable tasks using only 50 images per class far less than the ideal 5,000. . This skill is especially relevant when robotics engineers tend to stress the importance of engineering complex imaging systems as well as audio and other sensor technology.<<<DELIMITER>>>It may also benefit machine learning engineers who want to build functional computer vision systems. 
In a recent letter by David Dechow, I explored the concept in more detail.<<<DELIMITER>>>This article focuses mainly on manufacturing applications, but it applies to a wide range of computer vision projects in which the imaging configuration can be manipulated.<<<DELIMITER>>>Last week, GameStop's stock price exploded.<<<DELIMITER>>>The stock's surge was a joke: Individual, tech-savvy investors joined the internet to drive the price up and challenge the largest hedge funds that had bet on the company.<<<DELIMITER>>>But the situation on the ground forces them to bette. Several retail investors in GameStop made a profit in the past year.<<<DELIMITER>>>Automated trades, which are driven by AI, are faster and more capable than human traders.<<<DELIMITER>>>I believe that stock prices such as those experienced during the GameStop event suddenly move from retail investors to highly skilled AI-powered hedge funds that use AI to fund their investment using datasets, from financial indicators to social media.<<<DELIMITER>>>Instead, institutional investors have much more information than a typical retail investor who walks into a wall street betting site like Reddit.<<<DELIMITER>>>They are good at processing natural language and financial calculation, so they can interpret huge amounts of data.<<<DELIMITER>>>Consequently, no human trader is likely to beat an AI system at best. Just as a good programmer on an Atari game can't beat a good reinforcement learning algorithm, a human can't beat an AI-driven trading strategy. 
I'm not sure how to distinguish between commodities and investments.<<<DELIMITER>>>Investors who own stock based on their sense of value can make huge returns.<<<DELIMITER>>>Making investments in companies that are good ones can make them bigger and better for everyone.<<<DELIMITER>>>This is why I believe that investing money is different than investing in stocks (where the primary goal is to hold them until they become more valuable).<<<DELIMITER>>>In practice, this trade leads to very little, if any, net wealth growth.<<<DELIMITER>>>Why should we do this? Given the large number of possibilities to enlarge the economic pie, why bother? In a recent letter to The Washington Post by Helaine Olen, I understand that there's a bigger problem than getting rich quick schemes at the heart of GameStop's rising stock price.<<<DELIMITER>>>It goes to show that many young people are looking for a job because they are missing a place.<<<DELIMITER>>>I'm glad that some traders who made a great investment in GameStop plan to re-invest their investment for their own good.<<<DELIMITER>>>But I worry about people who might lose their lives by taking part in a potentially difficult task.<<<DELIMITER>>>For example, people who bought stocks at GameStop's peak on January 27 might lose massive amounts of money they're not ready to handle. When it comes to picking AI projects, I hope you'll pick ones that bring you fulfillment and benefit the world.<<<DELIMITER>>>We also need to make sure the wealth we generate is shared fairly and openly.<<<DELIMITER>>>In recent weeks, I wrote about the importance of clear and consistent label convention (as well as how using HLP to evaluate whether the labeling guidelines are necessitable to be updated). 
I also noted that, for training data that has been marked at random, an AI model that surpasses HLP on the test set may not actually perform better on a human scale.<<<DELIMITER>>>When humans transcribe an audio clip and label the same speech disfluency as um a common filler word in English speaking cultures,<<<DELIMITER>>>For example, a high filler like um (in the UK, erm) might indicate that you think your partner is a lower-level linguist.<<<DELIMITER>>>For all intents and purposes, at least 58 percent of the ratings given by a random pair of labelers align (a mean based on two different ratings: roughly 0.72 or roughly a f = 0.72).<<<DELIMITER>>>A more nuanced approach, such that the model repeatedly chose um, would work well with the labels it has received, because human labels tell us it's accurate 70 percent of the time.<<<DELIMITER>>>Even if their training data isn't necessarily superior, AI models can beat them even if the data doesn't. This is because training data is important when working with small amounts of data.<<<DELIMITER>>>Combined technologies like generative adversarial networks, data augmentation, transfer learning, and self-supervised training are making a huge difference in what little data can do.<<<DELIMITER>>>When I need 1,000 data points, the first thing I want to do is to make sure all labels are consistently used. . Based on last week's case study, which looked at how to tell if a scratch is significant by length, I'm still trying to figure out how to classify scratches in the data.<<<DELIMITER>>>If the labels contain noise for example, a labeler would need a lot of data to compute the best value (the left-hand graph in the above example).<<<DELIMITER>>>But if the data were pristine and all labelers agreed on the right length of the shift from 0 to 1, then the best path would be obvious. 
Theoretical knowledge: When you label data consistently, you need far fewer examples.<<<DELIMITER>>>In this simple example, the error rate is 1 minus the square root of the size of the training set on one side and 1 minus the size of the training set on the other, where m is the number of examples in the learning set.<<<DELIMITER>>>In contrast, errors diminish much more quickly when the label is consistent, so the algorithm can start with fewer training examples. . Use of clean labels tends to be beneficial.<<<DELIMITER>>>Generally speaking, the data of 1 million examples would require less stringent labeling requirements, since the algorithm could average any number of examples in a given class.<<<DELIMITER>>>Thus, it can be hard to edit many labels especially when the scale difference is 1,000-1 million and it can be complex and time-consuming.<<<DELIMITER>>>Clear labels are great for any machine learning task, but they're especially useful for small data.<<<DELIMITER>>>Last week, the European Union rumored a 3-5-year suspension in pubs for face recognition.<<<DELIMITER>>>While face recognition is a controversial technology, I'm grateful that the European Union is protecting human rights and civil society.<<<DELIMITER>>>But it's not a good idea to suggest a five-year pause in development, which would be an undue burden especially given the pace of progress. If it were implemented, it would impede the European Union's competitiveness relative to that of the U.S., China, and other nations, which would be doomed. 
Fortunately, face recognition is still in its infancy and the challenges ahead are still in their infancy.<<<DELIMITER>>>While various teams have made significant progress toward high accuracy on face recognition benchmarks like LFW, the field is still young and in progress.<<<DELIMITER>>>Open source software makes it easier to recognize faces in still images taken with the front camera, but challenges remain; among them, multi-camera tracking, re-identity when people exit and re-enter the field of view, and systems that adapt when a camera turns off or needs a re-calibration.<<<DELIMITER>>>This capability is likely to improve rapidly over the next few years. Countries that have a foundation in developing this technology will probably outperform others.<<<DELIMITER>>>Would it be a paradox if the European Union, which has stalled development of face recognition systems, eventually bought them from the U.S. and China? The Universal Declaration of Human Rights is a landmark document in the field of human rights that inspires individuals all over the world.<<<DELIMITER>>>Making good regulations is hard work.<<<DELIMITER>>>We need to make sure that society that values individual privacy doesn't fall behind in the technology arena because it lives up to these precious principles.<<<DELIMITER>>>Rather than slow down their progress, let's encourage them to push those values forward and move toward our goals together..<<<DELIMITER>>>As I visited a nearby homeless camp over the weekend, I thought I was speaking to two people in the same apartment: one person was speaking to me directly and another in the same apartment, but I was not watching.<<<DELIMITER>>>I also spoke with a 21-year-old woman who escaped her abusive childhood home and asked her to sleep in a tent that would have protected her from the elements. . I'm forced every day to reflect on what I'm privileged to do: have food, a place to stay, and a wide range of connections. 
I have a responsibility to help the homeless. I don't think we have a strategic plan for helping the growing homeless population.<<<DELIMITER>>>I still have an open mind for that and am determined to find a solution.<<<DELIMITER>>>While developing products and tools that connect people, I also want to understand how to make them available to all the different groups of people I met last week.<<<DELIMITER>>>Sometimes, I find reading a research article motivated enough to applaud the authors and tell them how grateful I am for their work.<<<DELIMITER>>>But I didn't feel this until I finished reading DPO (direct preference optimization, a book co-authored by Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher Manning, and Chelsea Finn).<<<DELIMITER>>>Reading it in a busy coffee shop, I couldn't applaud openly that might have been obvious.. This fascinating work offers an alternative to RLHF that makes it much easier to tune language models to human preferences.<<<DELIMITER>>>In addition, I've often been asked whether an institution of higher learning which doesn't have the computational richness of a large tech company can carry out groundbreaking work on LLMs.<<<DELIMITER>>>Clearly, the answer is yes. . This article illustrates the striking relationship between algorithmic and mathematical knowledge, as reflected in the thoughtful approach taken by a group of researchers. RLHF became a household name in LLM training as a direct result of the work of the InstructGPT paper, which adapted the technique to develop LLMs.<<<DELIMITER>>>A common implementation of the algorithm is simple: Human judges compare LLM-samples produced using the same input query against pairs to determine their preferences.<<<DELIMITER>>>For example:. Humans prefer the more informative and non-toxic output. 
Using this intuition to understand human preferences can enable us to develop a reward function.<<<DELIMITER>>>The reward function, modeled using a transformer architecture, aims to assign higher values to outputs that human judges found to be acceptable. Then, using this learned reward signal, we can use reinforcement learning to train the LLM so (i) we can optimize its response and (ii) limit its change potential that is, it becomes more regularized. . Consider this algorithm.<<<DELIMITER>>>We have to generate very different representations of the reward function and LLM.<<<DELIMITER>>>It's also widely acknowledged that the last step of reinforcement learning is hyperparametric, but DPO significantly increases the speed with which we choose hyperparameters.<<<DELIMITER>>>Rather than requiring two different neural network architectures one for the reward function and another for the large language model (LLM) the authors show how to infer the reward function that an existing LLM is optimized to maximize based on the regularization term.<<<DELIMITER>>>Combining the transformer networks creates a single model.<<<DELIMITER>>>Therefore, you just need to train the LLM and don't need to interact with a separate reward function.<<<DELIMITER>>>DPO trains the LLM to ensure that its reward function matches what humans want.<<<DELIMITER>>>Further, the authors show that DPO outperforms many traditional implementations of RLHF in achieving its goal (i) and (ii). . 
RLHF is an important part of the most complex LLMs.<<<DELIMITER>>>Stanford researchers, drawing on their mathematical background, developed a simpler alternative to a more efficient one.<<<DELIMITER>>>I approach this new paper with fervent enthusiasm, recognizing that its value can be fully appreciated only after having spent time debating and analyzing it, and foreseeing significant benefits for DPO in the future LLMs and beyond.<<<DELIMITER>>>Already it's incorporated into the latest language models, including Mistral's Mixtral. Why it matters: The field is still in its infancy, and lots of room to grow.<<<DELIMITER>>>Also, as you can read below, don't need lots of GPUs to do impactful research. It goes to show that deep thinking and problem solving work well with very few compute options. The recent NeurIPS conference highlighted the variety of creative work from academic labs, independent researchers, and companies of all sizes. Unfortunately, I'm not surprised that many researchers are paying attention to papers published by large tech companies while ignoring many equally important contributions from others.<<<DELIMITER>>>If the DPO study had been published by a large LLM company, I imagine that it would have received widespread media coverage and been heralded as a breakthrough in computational technology.<<<DELIMITER>>>Let's share common ground and celebrate the progress we've made in AI.<<<DELIMITER>>>Beyond generating huge value, AI concentrated massive power in a few hands.<<<DELIMITER>>>Our community is wrestling with fair use, and that applies to it. . The Markup article highlights an issue of car insurance giant Allstate's pricing strategy, which has come under fire for price discrimination. The company charges different premiums to different customers based not just on their level of risk but also on their ability to pay.<<<DELIMITER>>>Is this generally okay? 
Digital pricing is making it easier to compare prices online, and purchasing power has shifted from suppliers to consumers.<<<DELIMITER>>>Not only does this new technology enable companies to create products specific to individual customers, it also enables them to build bespoke services like point A to point B shuttle service at a particular time or health insurance plans tailored to the customer's medical history. AI helps automate pricing and maximizes salespeople's bottom line.<<<DELIMITER>>>The result can be either better quality products at more transparent prices or vice versa. Customers will flip when an ecommerce platform offers the same product at different prices to different customers.<<<DELIMITER>>>This helps to stem the tide of price discrimination.<<<DELIMITER>>>Why are vendors incentivized to offer low introductory prices is still an important factor.<<<DELIMITER>>>In a 2016 study by Uber, researchers found that consumers charge higher rates when their mobile phones have low battery life.<<<DELIMITER>>>The company said it didn't take advantage of the opportunity, but I occasionally wonder if I should do that more often.<<<DELIMITER>>>Not because I'd like to save a few bucks on one purchase, but because I want to educate vendors' AI systems that I'm sensitive to price, so they can offer cheaper prices. An economics course I took in college taught me how the market can create surpluses for producers and consumers.<<<DELIMITER>>>But this leads to important debates about obsolete models and our theory of fairness.This is a complex matter that deserves careful consideration.<<<DELIMITER>>>I hope we can work together to provide high-quality products and services at reasonable prices..<<<DELIMITER>>>You trained an accurate neural network using a Jupyter notebook.<<<DELIMITER>>>What can we expect? 
The emergence of machine learning engineering in production is a step toward building models into practical applications, and this excitement has excited engineers and teams that aim to deploy models to users.What's new: DeepLearning.AI released Machine Learning Engineering for Production Specialization (MLOps).<<<DELIMITER>>>I teach this specialization in collaboration with my Google teammates Robert Crowe and Laurence Moroney.<<<DELIMITER>>>In addition, Landing AI has worked with numerous companies in a variety of industries. Machine learning models are evolving rapidly toward standardized approaches, in which the systems are built and deployed in a consistent way.<<<DELIMITER>>>This specialization will put you at the cutting edge of that movement. . I remember managing code versioning by emailing C++ files to colleagues as attachments with a simple message, I did my work, feel free to edit. This was slow and error-prone.<<<DELIMITER>>>tone: informal/ conversational, regional style = thank you modified tone: polished/ general English. 
However, with the help of versioning tools and best practices, everyone is working more collaboratively on coding.<<<DELIMITER>>>I remember designing neural network architectures in C++ or Python, starting with DISTBELIEF, an early prototype that would go on to become TensorFlow.<<<DELIMITER>>>Today, tools like TensorFlow and PyTorch have made building neural networks much faster, but building and deploying systems that are ready for production still requires a lot of time and effort.<<<DELIMITER>>>Issues such as data integrity, accuracy, and relevance whether the dataset is defective, what happens when an underlying situation changes overseeing model development, performing test sets, deploying models to a real-world environment, and managing computation to optimize processing power have become ever more important.<<<DELIMITER>>>MLOps tools help build, deploy, monitor, and maintain machine learning models at scale.<<<DELIMITER>>>Just as version management via Git and frameworks like TensorFlow and PyTorch have made building models simpler and faster, so MLOps tools that are emerging will have a huge impact on productivity in machine learning.Taking this course was an unusual time for me.<<<DELIMITER>>>MLOps tools and standards are still evolving, and it's exciting to share ideas about where things stand.<<<DELIMITER>>>You will also find the cutting-edge aspects of machine learning development fascinating. I also hope you gain experience that enables you to build and deploy machine learning systems.<<<DELIMITER>>>Is AI sentient or conscious? This is a question of philosophy rather than science, because there is no widely agreed definition or test set.<<<DELIMITER>>>While it may seem tempting to define specific benchmarks for how we measure AI, I worry that hasty deployments will lead to overly optimistic claims and wildly hyped coverage. . 
The idea of self-awareness that encompasses an individual's conscious acknowledgment of their own identity raises questions about what is consciousness and how to assess it.<<<DELIMITER>>>For example, let's say a robot is conscious when it recognizes itself that's really how we know if it has.<<<DELIMITER>>>But the definition is important: If a robot can simply learn to recognize itself, it will be able to hype AI's state of consciousness. This argument is not intended to predict what will happen.<<<DELIMITER>>>It was 10 years ago, when the mass media reported work that showed that a robot could pass the mirror test and that the authors thought it was conscious. Clearly, unambiguous language is essential to scientific progress, but it's not always easy: Many people already have deep-rooted beliefs about what it means to be sentient, conscious, or soulful.<<<DELIMITER>>>No consensus on the terminology is emerging.<<<DELIMITER>>>For example:. Do all living things have souls? This raises questions about organisms such as bacteria or viruses. Even if a scientific revision is proposed and widely accepted, many people who have not been informed by this revision will continue to use the term according to their existing understanding.<<<DELIMITER>>>But when the news media says that AI is superior to human intelligence, these statements are often specific and limited in scope and not general intelligence (say, an AI system that passes a test to determine whether it is conscious), but it still falls short of the intricate, multifaceted definition of human cognition.<<<DELIMITER>>>But the public will generally assume that AI achieved sentience because of what humans commonly see as this concept. 
Thus I'm ambivalent about the task of redefining AI general intelligence.<<<DELIMITER>>>I believe the current view about AGI is that it refers to an AI system that can perform any mental task that humans want it to do.<<<DELIMITER>>>It will not take many decades to achieve AGI.<<<DELIMITER>>>I can't deny that this may be a tempting solution, but changing the word AGI to lower the bar might make it seem like a very simple success story.<<<DELIMITER>>>I've noticed that the growing trend of generative AI applications has implications for deployment, affecting both large companies and individual developers. . Data is playing a less central role in designing generative AI applications.<<<DELIMITER>>>Data gravity was introduced by IT researcher Dave McCrory in 2010 to explain that the presence of data attracts more people to create or carry out data.<<<DELIMITER>>>The concept of data gravity is a powerful concept in traditional software.<<<DELIMITER>>>It's convenient to have large datasets in the cloud, but the high cost of moving data to a different location is a major downside.<<<DELIMITER>>>Many companies pick from AWS, Azure, or Google Cloud Platform and base their business on it.But for many generative AI applications, the cost of processing far outstrips the cost of transmission.<<<DELIMITER>>>This makes data gravity less important because the web server or data center connected to the cloud server or server is less easily connected, and it's more costly to build a system that transmits packets between multiple servers across the web; say, 1GB of data might cost $0.10.<<<DELIMITER>>>A 1GB standard costs around $125 per input token, with an average of 4 characters per input token and around 250 million total tokens, compared to the relatively inexpensive gpt-3.5-turbo-0125.<<<DELIMITER>>>But with gpt-4-0125-pretraining, the data costs would be 20x more than the transmission costs.<<<DELIMITER>>>In addition, the large computational amount of LLMs that need to process 
token inputs and generate output data can result in significant latency but that's usually longer than the amount of time it takes for data to travel the web. So, even if a piece of software is based on a particular cloud provider's infrastructure, it's still practical to send an LLM prompt to external parties like OpenAI, Anthropic, Anyscale, or.ai or large cloud providers like AWS, Azure, and Google Cloud to get feedback.<<<DELIMITER>>>There may not be an incentive anymore to focus solely on a single, cloud-centric infrastructure.<<<DELIMITER>>>This has implications for a wide range of stakeholders:. As a developer, we assemble complex AI applications from dozens of Software-as-a-Service providers across the internet, integrating our services seamlessly. CIOs are challenged to manage data distribution and maintain accurate lists of vendors. . Generative AI is transforming the nature of competition for large cloud companies, since the cloud's workloads of customers now encompass abilities that look little like the traditional software applications. We're thinking: We're thinking: While building a new<<<DELIMITER>>>My take-away relates to generative AI: The competitive landscape for traditional software components has largely remained the same. With new AI component architectures available, we're likely to see shifts in the market forces and industry's gravitation.<<<DELIMITER>>>For example, switching from one large language model (LLM) provider to another is relatively easy these days.Although switching from one LLM to a new one can be challenging, there's no end in sight.<<<DELIMITER>>>In contrast, switching from one vector database to another is much harder. . Once you've stored a lot of data in a vector database, how can you navigate it to a different system? 
Data management has always been a core part of cloud computing, and data gravity is a significant driver of competition in companies.<<<DELIMITER>>>Decreased data costs are an exciting time for developers and businesses.<<<DELIMITER>>>My father completed 146 online courses over 10 years.<<<DELIMITER>>>His academic interests have ranged from creative writing to complexity theory. One of the highlights of life-long learning is the work of Ronald Ng.<<<DELIMITER>>>To him, learning is a must-have pursuit.<<<DELIMITER>>>He says, The pleasure of learning helps you stay sharp and understand the value of complex topics.<<<DELIMITER>>>He continues, We need to preserve his childhood attitude, and his decision to take online courses is not a hobby, but a desire to learn.<<<DELIMITER>>>At age 74, he still works as a hematologist and an adjunct mediator at court. . You never know when or how much knowledge is valuable.<<<DELIMITER>>>My father was a doctor when he treated a patient who was convinced he had eaten mercury.<<<DELIMITER>>>The authors didn't report any presence in the patient's blood.<<<DELIMITER>>>He told me about a course in forensic medicine at Nanyang Technological University, where he learned that mercury builds up in hair.<<<DELIMITER>>>A hair sample collected from the patient revealed the toxic metal.<<<DELIMITER>>>Having learned this, my father was able to care for her. 
When I was younger, my father was a violinist with the Hong Kong Philharmonic and an avid astronomer who watched the stars through a telescope he had placed on the rooftop of our apartment complex.<<<DELIMITER>>>He shared a lesson he learned as a military volunteer when he learned a principle that far surpasses military medicine and leadership: We need very little in life to be happy if we really appreciate what we have.<<<DELIMITER>>>I hope you will continue to learn his wonderful life story even after age 74 and beyond!.<<<DELIMITER>>>The proliferation of cloud-based AI software has led to worries about privacy and security.<<<DELIMITER>>>But I've noticed that the users and developers who use it lack a robust framework for understanding how software vendors manage, use, and distribute their data.<<<DELIMITER>>>For example, does a company's promise, don't process customer data, mean absolute privacy with respect to your data? We need to establish different standards of privacy for what cloud services deliver. Here's how we can judge different degrees of privacy:. 1. **No guarantees**: In this scenario, companies state explicitly that they have no responsibility for protecting users' privacy.<<<DELIMITER>>>For example, a reputable AI company may inadvertently use machine learning models based on user data if they aren't properly maintained.<<<DELIMITER>>>Many startups start up in this area, but they don't prioritize privacy until they realize a customer need. Please let me know if you're not seeing anyone else get their data.<<<DELIMITER>>>To comply, a company must ensure that no one is allowed to use your data for training or publish it online in a language other than English.<<<DELIMITER>>>Many large startups (and companies that have built large language models) now follow this path. 
We also have a limited set of safeguards against data leakage: Human review of your data will take place only when, for example, you receive a lawful court order or other request, unless your data is flagged automatically by our system for safety.<<<DELIMITER>>>In practice, many prominent cloud providers are trusted with protecting user privacy even when their terms of use don't mandate it. Unfettered access: The company has full control over your data and access to it at all times.<<<DELIMITER>>>For example:. We're thinking: Our customers store data inside our facilities, and our company has no access to it.<<<DELIMITER>>>If you run an LLM on your device, it's completely within your control, and you can't tell anyone else what you want to do, because neither the company nor anyone else has access to the data or model outputs generated on your device.<<<DELIMITER>>>In contrast, if sensitive data leaves a customer's premises, the data can be encrypted, and the cloud provider has no data left open for the customer to access.<<<DELIMITER>>>For example, while end-to-end encrypted messaging platforms like Signal and WhatsApp allow users to maintain privacy, the company that hosts the message can't access the message's contents, and does receive metadata (sender/receiver details), as well as message timing and size, these differences can be nuanced. . At each level, there are subtleties that can affect a person's understanding of privacy.<<<DELIMITER>>>In practice, a promise not to use your data for training can mean different things in different organizations.<<<DELIMITER>>>However, some generative AI models especially image generators can take a photo and generate a copy of their training data. 
Thus, training a generative AI algorithm on customer data could lead to leakage of sensitive data.<<<DELIMITER>>>However, tuning a small number of parameters (such as the learning rate) on a particular customer data layer doesn't compromise data privacy.<<<DELIMITER>>>How the data is used in training affects the risk of data leakage.<<<DELIMITER>>>The Limited Access level is also complex.<<<DELIMITER>>>When a company prioritizes user privacy, it's important to understand when its employees can view customer data.<<<DELIMITER>>>Even with careful precautions, privacy issues can still be challenging.<<<DELIMITER>>>For example, if a restricted number of people in a secure location have access to abbreviated data snippets that are not connected to their company identifier, this creates greater security relative to having multiple employees browse corporate data. Privacy.<<<DELIMITER>>>Trusting a company to protect its data is trusting the safety and trustworthiness of its IT infrastructure. . The rise of cloud-based SaaS has taken a step in recent years.<<<DELIMITER>>>However, some customers are still willing to set up infrastructure on site in their data center.<<<DELIMITER>>>One reason for this discrepancy is that many SaaS providers don't provide enough assurance about privacy and security, but many customers have data that's so sensitive that it's hard to access. . How can we address these issues? I would like to see a more sophisticated approach for our industry that addresses privacy and clearly communicates the guarantees and limitations that providers provide. Data privacy is becoming more important for regulators globally.<<<DELIMITER>>>For example, if a jurisdiction shifts the definition of a child from 13 to 18 years old, then the data associated with 13 to 18-year-olds might need to change, but the frequency and complexity of such changes makes it difficult to track manually. 
I've been pleased to see AI working in this area.<<<DELIMITER>>>Li, CEO of Commonsense Privacy (a portfolio company of AI Fund), leverages large language models to help companies scale their privacy decisions and monitor global regulations.<<<DELIMITER>>>I hope the themes in my recent TED talk on AI isn't the problem, it's the solution prevail and address the question of the relationship between AI and privacy.<<<DELIMITER>>>Last week, I described four design patterns for AI-assisted workflows that are poised to set a new direction for 2023: reflection, tool use, strategic planning, and multi-agent interactions.<<<DELIMITER>>>Rather than having an LLM generate the output in one pass, an alternative workflow requires prompting the LLM repeatedly to improve over time.<<<DELIMITER>>>In this letter, I'd like to discuss reflection.<<<DELIMITER>>>For example, certain design patterns that are relatively easy to implement can bring about significant performance improvements for example, with an LLM like ChatGPT or Claude and sometimes the initial output may require tuning, so you can give the LLM a constructive feedback to help it get better performance.<<<DELIMITER>>>If we were to automate critical feedback, we would be able to criticize our own output in real time and then improve its own output accordingly. . This is the idea behind Reflection. Say you have an LLM writing code.<<<DELIMITER>>>The system can generate a particular piece of code automatically to carry out a particular task.<<<DELIMITER>>>After that, we can prompt the model to reflect on its own output. . Here's a sample code for task X:. [code previously generated by the LLM] Review the code carefully to make sure it meets standards of correctness, style, and performance. Feedback. 
In some cases, this helps the model to think through problems and give thoughtful suggestions for improvement.<<<DELIMITER>>>Then we can give the text-to-text model a contextual input of (i) the generated code snippet and (ii) an LLM's need to use that feedback to rewrite the original code.<<<DELIMITER>>>This can lead to a better outcome.<<<DELIMITER>>>If you're doing the critiquing or rewriting over and over, this might be good for you to re-write the critique and/or re-write the words correctly.<<<DELIMITER>>>This helps the LLM identify areas to improve on and improve its performance on common tasks such as coding, writing, answering queries, and provide the LLM with evaluation tools to help it evaluate its output more accurately, such as testing its output to make sure the generated code is correct or using the web to validate a piece of writing.<<<DELIMITER>>>It will also allow the system to examine its own performance and identify areas for improvement. Or consider using a multi-agent framework to include reflection as a capability.<<<DELIMITER>>>I've found it useful to implement two agents, one to generate output and another to provide feedback on the first agent.<<<DELIMITER>>>The ensuing dialogue between the agents yielded stronger responses. I was pleased to see a handful of improvements as reflection was a key part of the agent workflow.<<<DELIMITER>>>I hope you'll apply this approach to your practice..<<<DELIMITER>>>If you're interested in learning more about reflection, I encourage you to read the following research papers:. Self-Refine: Refinement with Self-Feedback, Madaan et al.<<<DELIMITER>>>provided to change, as if it were a citation or publication citation formatted in an accurate literature.<<<DELIMITER>>>(2023). 
Critical of LLMs' ability to correct themselves when prompting tool-interactive critiques has been the brainstorming work of Gou and colleagues.<<<DELIMITER>>>We'll find more agentic design patterns in 2024 in more letters..<<<DELIMITER>>>For the past decade, I've spent my mother's birthday in December away from home, visiting either Singapore or Hong Kong in the middle of the week.<<<DELIMITER>>>This year, we conducted the video conferencing interview on Zoom.<<<DELIMITER>>>Although we were physically far apart, it was heart-warming to hear family members in the United States, Singapore, Hong Kong, and New Zealand join me in a rousing, if somewhat misplaced, rendition of Happy Birthday to You!. I wish I could have been there on a Zoom call with all of you to wish you a wonderful holiday season and even a brighter New Year. . Every holiday season, I think about the most important people in my life, remember their kindness, and tell them my gratitude.<<<DELIMITER>>>I feel a closer connection with them.<<<DELIMITER>>>If we celebrate holidays separately, why not reflect on the people you care about so much? Why are they important to you? This is a great time to build relationships, even if you're only with one person or with a long distance. I hope you'll find good things to do in this life..<<<DELIMITER>>>Indeed, AI systems are concerned about bias.<<<DELIMITER>>>For example, if a judge is passing sentence to a defendant and is using an AI system that routinely generates higher rates of recidivism for a particular race of defendants, it does raise serious issues.<<<DELIMITER>>>At the same time, consider consistency as another important factor in improving AI models.<<<DELIMITER>>>Let's examine how human judgment is inconsistently determined and how to address the differences. 
If, for example, two people were found guilty of the same crime by a judge and sentenced to 30 days in prison or three years in prison, it would be unfair.<<<DELIMITER>>>But human judgment and decision making are very inconsistent, as studies show. Noise: A Flaw in Human Judgment, by Daniel Kahneman and co-authors provides compelling evidence of this inconsistency. . A study found that judges were more likely to sentence defendants to harsher punishment when their local football team had suffered an unexpected defeat, suggesting that the judge's mood may have been affected by the event.<<<DELIMITER>>>Jurors, who are made up of simple people, can be tricked by aliens. If the outcome of football plays a role in their judgment, then other trivial factors can influence that decision as well.Most human decisions are made according to many criteria, and we don't necessarily specify these criteria clearly before we evaluate them.<<<DELIMITER>>>For example, in medical settings, I've seen doctors make inconsistent diagnoses with similar patient data.<<<DELIMITER>>>In one study, my friend Alice learned that the mean time of diagnosis varies depending on the severity of the problem how likely are her diagnosis preferences to vary from morning to afternoon.<<<DELIMITER>>>To be clear, Alice is a wonderful doctor and friend.<<<DELIMITER>>>This score outstripped her ability to do the job well. Experienced manufacturing experts in many companies, I've seen inspectors disagree on whether parts with similar flaws are problematic, and human labelers have found inconsistent practices for marking products online.<<<DELIMITER>>>This can be tricky when deciding whether a particular item should be classified as an electronic device or an item of entertainment (an interpretation of which is often a matter of choice). 
A trained neural network will always generate the same output for the same input.<<<DELIMITER>>>Similarly, given the same inputs, our trained models often get the same result.<<<DELIMITER>>>Automated software is highly consistent.<<<DELIMITER>>>One of the many benefits of automation is that algorithms can make better decisions than humans can.<<<DELIMITER>>>I believe they can give patients more consistently and fairly treated options, automate manufacturing, reduce consumer retail product listings, and so on.But when we talk about building an AI system, it's important to think about how to limit bias in the system and how to set objective benchmarks for measuring its performance relative to human decisions.<<<DELIMITER>>>If you're applying for approval of an AI project, keep in mind the importance of consistency.<<<DELIMITER>>>If you can compare your algorithm's safety with that of human decision makers, you can strengthen your case for automating the workaround.<<<DELIMITER>>>Ranking algorithms are often kept secret.<<<DELIMITER>>>Let's examine the pros and cons of letting people know how certain companies work behind the scenes, especially regarding their ranking algorithms.Why are these algorithms confidential? . One possible benefit of keeping the algorithm secret is that it makes it harder for scammers and spammers to manipulate its output.<<<DELIMITER>>>Darkness is not the only defense, but it is a deterrent.<<<DELIMITER>>>The fact remains that open source software is secure, and the fact that we collaboratively build it can identify vulnerabilities and fix them quickly.<<<DELIMITER>>>But I think there's a tradeoff between keeping traditional software free of hackers and keeping a ranking algorithm free of rigorous statistical manipulation.<<<DELIMITER>>>Rather than targeting a live website, which may arouse suspicion among security teams, attackers can repeatedly search an offline copy of the algorithm to identify message formats they're likely to promote. . 
If the goal is to explain how a learning algorithm works, then publishing it also requires that the data at the heart of its behavior be disclosed both factors contribute equally to its performance.<<<DELIMITER>>>But making Twitter's dataset publicly available is not feasible.<<<DELIMITER>>>One challenge is the sheer volume of data.<<<DELIMITER>>>Further, the company has a duty to protect users' personal data, since the dataset may contain location tracking, interests, and user patterns.If only the code and data were available, the algorithm's behavior would be hard to analyze because, in fact, it's inherently opaque in machine learning. . Proprietary use of algorithms can create a significant edge.<<<DELIMITER>>>Developing Twitter's ranking algorithm was a costly but fruitful investment that set Twitter apart from the rest of the industry.<<<DELIMITER>>>Sharing it to competitors might give them a shot at it. In contrast, making ranking algorithms publicly available would have created a few unique opportunities: The first step would have been for experts and the general public to better understand how these algorithms work, which would have helped them spot issues and helped them assess the provider's neutrality.<<<DELIMITER>>>More scrutiny would likely lead companies to improve their products and build trust among consumers. Given the massive impact these algorithms can have on millions of people and potentially even affect the outcome of democratic elections there's a compelling argument for citizens and governments to have a better understanding of how they work. . 
Indeed, scoring algorithms is just one way to protect free speech online.<<<DELIMITER>>>Many pundits have panned the claim that Musk's stance on social media regulation is wrong.<<<DELIMITER>>>Unfortunately, other social media platforms have also struggled with toxicity, financial fraud, and spam because they chose not to restrict user posts.<<<DELIMITER>>>Former Reddit CEO Yishan Wong shared his expertise on moderating content on Twitter in a series of tweets. Twitter has been a great platform for the AI community to share ideas, and I have deep admiration for Parag Agrawal and Jack Dorsey, who have led the company through these transformations in the ever-changing social media landscape.<<<DELIMITER>>>ML's ethics, transparency, and accountability team also did research that encouraged people to share their ideas.<<<DELIMITER>>>Twitter's performance has been criticized, but it has opened a market for potential corporate investors such as Musk or private equity firms. Whether or not Musk makes a successful bid, should society benefit from internet companies' exposure to their ranking algorithms? This is a complex matter that deserves more than mere statements about free speech.<<<DELIMITER>>>My instinctual answer is yes:. The benefits of having a poorly constructed set of rules about how we would do anything in code may outstrip the harm that this approach might do.<<<DELIMITER>>>But there are also worrisome questions to be answered:. What can we do to protect these open-source learning algorithms from misuse? And given the sheer amount of time and effort we've invested in developing them, how can we possibly solve the problems that lie behind these intellectual properties? 
Many open source tools are designed to solve problems of very high quality.<<<DELIMITER>>>I believe that AI agent workflows will be the cornerstone of AI progress this year, more than the next generation of foundation models will be.<<<DELIMITER>>>I believe this is something that AI practitioners should take seriously.<<<DELIMITER>>>Today, we use most LLMs in zero-shot mode, telling the model to generate its final output token by token, without giving it a chance to iterate.<<<DELIMITER>>>This is like having someone write something coherent and free of charge to edit it whenever possible and still get a good result.<<<DELIMITER>>>Despite the challenges, LLMs excel at it. Through an agentic workflow, we can prompt the LLM to review a document many times.<<<DELIMITER>>>For example, you might write consistently, in phases (i) you would need to draft a blueprint to organize your thoughts (ii) you would need to carry out web searches to gather more information (iii) you would need to write an initial draft (iv) you would need to review the draft to identify flaws (ie, unjustified arguments, or unnecessary detail). You would need to revise it several times a year to address these issues and make the pieces more refined. This iterative process is important for producing good writing that takes many factors into<<<DELIMITER>>>Working with AI lets us do it again and again, and it's not uncommon for a repeated way to write something in a single sitting to produce better output than a repeated way of doing it. . 
Devin posted his interactive demo on social media.<<<DELIMITER>>>My group has published hundreds of articles on AI development that generates computer code.<<<DELIMITER>>>We divided the datasets into groups and focused on a given algorithm's performance relative to the widely used HumanEval coding standard.<<<DELIMITER>>>As you can read below in this example, GPT-3.5 achieved a 48.1 percent accuracy in zero-shot configuration.<<<DELIMITER>>>GPT-4 achieved 67 percent accuracy in zero-shot mode.<<<DELIMITER>>>While the progress between GPT-3.5 and GPT-4 is impressive, it's also eclipsed by the added functionality of adding an iterative agent to the workflow.<<<DELIMITER>>>GPT-3.5 can be used in an agent loop to achieve 95.1 percent accuracy. The rapid rise of open source agent tools and agentic research is creating an exciting time, but the breadth and variety of opportunities in this exciting field also creates unique challenges.<<<DELIMITER>>>To understand this, I would like to describe a conceptual framework for classifying design patterns in agentic systems.<<<DELIMITER>>>Many of AI Fund's teams have integrated various design patterns into their applications, and I hope they will be useful to you. The LLM uses self-awareness to improve its performance by evaluating its own output and identifying areas to improve. Tool use: The LLM can take tools like web search functionality, code execution, or other functions to help you collect data, initiate actions, or process information. Planning: The LLM must develop and execute a multi-step plan to accomplish a goal; for example, writing an essay outline, conducting<<<DELIMITER>>>Every year, we are seeing exciting progress in AI.<<<DELIMITER>>>But in the run up to Halloween and the brief but symbolic relationship between the physical world and the supernatural, it becomes clear that the spirits are taking advantage of these conditions (as humans are). . 
I wrote previously that prompt engineering the art of writing text prompts to get the output you want from an AI model is taking off.<<<DELIMITER>>>The Japanese word for prompt, 呪文, also means magic spell. Interesting fact: Natural language developer Paul O'Leary McCann wrote that generating an image using models such as DALL·E 2 or Stable Diffusion sounds like casting a spell, because those programs bring to life artists like Pablo Picasso, who have been dead for decades, and that's okay. Why it matters: AI practitioners in Japan may be onto something important.<<<DELIMITER>>>startup HereAfter AI is building chatbots that look and speak like someone from your past.<<<DELIMITER>>>Our goal is to foster communication between the living and the dead.<<<DELIMITER>>>If I look back on the zombie apocalypse and look back on my experiences seeing zombies in popular culture, I'm less likely to see such a setback. But I'm much more worried about the motives and consequences of certain AI researchers who are building scary creatures.<<<DELIMITER>>>It's hard to imagine the huge amount of research on transformers published so far.<<<DELIMITER>>>All transformers have a multi-head focus.<<<DELIMITER>>>Since when is having multiple heads considered normal? Researchers have integrated multi-head architectures into computers, and the results are getting rave reviews.Of course, there's more than meets the eye with transformers. . This year, too, we've seen progress in learning from masked inputs thanks to new techniques like Masked Autoencoders, MaskGIT, and MaskViT, which deliver impressive performance on challenging tasks.<<<DELIMITER>>>Halloween mask is a metaphor for the principles behind this movement. . How does the phrase trick or treat describe this? Andrew, have you ever peered at a neural network's output and been struck by its strange likeness? 
AI has great potential from drugs for potentially fatal illnesses to building fascinating factories and wondrous works of art all of which have a darker side that scares.<<<DELIMITER>>>What if machine learning algorithms were designed to generate these deadly compounds? Are business leaders embracing automation more valuable than ever? What does this mean for humanity when competition is at its highest? Will the dark side of AI win? In this issue, we examine the dual nature of AI: its potential to cause harm and facilitate innovation. We face this challenge head-on.<<<DELIMITER>>>AI's future is grim, just like the unknown forces that AI researchers report. . AI researchers are beginning to see seemingly paranormal activity.<<<DELIMITER>>>In AI, the question is: Do glitches cause hallucinations in AI systems, or do these processes evoke an nascent consciousness? Recent research suggests that such systems are conscious.<<<DELIMITER>>>This raises ethical questions about human influence and control over sentient AI.<<<DELIMITER>>>These worries raise disturbing questions about which cognitive architecture might result from eating only online data. . The AI community has seen examples of machine sentience. In February, Ilya Sutskever claimed that large neural networks might exhibit a primitive level of consciousness. Andrej Karpathy (who co-authored the document), Yann LeCun, and Judea Pearl (who criticized the claim), called it unfounded and misleading. . A Google engineer recently said that a chatbot powered by LaMDA, the company's suite<<<DELIMITER>>>He published a collection of conversations between the bots about privacy, individual rights, and fear of shutdown.<<<DELIMITER>>>Google terminated the engineer and threw out the accusations. 
As sentient AI's future fades, researchers wrote that DALL·E 2 was building a unique language.<<<DELIMITER>>>When prompted to generate a representation of DALL·E 2 in the image, it frequently generates arbitrary combinations of characters.<<<DELIMITER>>>They found that, when the same nonsense data was re-entered into the model, the same output was generated.<<<DELIMITER>>>For example, entering such phrases as apocalyptic verses generated images of birds. In early September, a multimedia artist and an unspecified text-to-image model performed disturbing experiments. The artist found that purposely vague prompts prompting the model's latent boundary created disturbing images a woman with fair skin, brown hair, and thin lips, often accompanied by graphic violence.<<<DELIMITER>>>She goes by the alias Loab. . It appears this is a fake.Most of the stories involved LaMDA's reported self-awareness with some caveats as viewed through the lens of the media.<<<DELIMITER>>>Many observers have speculated that LaMDA's behavior is primarily human-induced, and that DALL·E 2's language potential is largely the product of algorithms and statistics rather than creativity.<<<DELIMITER>>>Models learn by copying the training data, and while some are remarkably accurate, there's no conclusive evidence that they're doing so intentionally, consciously, or with introspection.<<<DELIMITER>>>Whatever its name, undeniably it does have an unsettling presence. . When faced with unexplainable events, humans routinely call upon fantastic exes.<<<DELIMITER>>>Science doesn't yet have a robust way to test whether a computer is sentient.<<<DELIMITER>>>Until we get a lot of evidence, let's take the claims of machine sentience or consciousness with a healthy dose of skepticism. 
Building a truly great AI requires significant hardware development.<<<DELIMITER>>>But the threat to global availability of high-end AI chips arises if chip production and supply chains are disrupted.Why it matters: Almost all processors are made in Taiwan, where China has wrestled with China.<<<DELIMITER>>>Most of the chips were built by U.S. companies, which prevents China from getting them.<<<DELIMITER>>>That may lead China to retaliate by severing its economic relationship with the U.S.<<<DELIMITER>>>access to Taiwan's manufacturing facilities.<<<DELIMITER>>>War could lead to human suffering.<<<DELIMITER>>>Of course, we shouldn't underestimate the risk. It could have huge implications for AI, and I'm concerned that it would impede progress in this important field. Unfortunately, I have heard from history that when nations compete like this, the consequences are dire, as the tensions between China and the U.S.<<<DELIMITER>>>are on a collision course that threatens the global supply chain of advanced chips. In October, the U.S.<<<DELIMITER>>>government recently passed numerous restrictions on certain U.S.<<<DELIMITER>>>It will prohibit all sales of chips and semiconductor manufacturing equipment to Chinese companies.<<<DELIMITER>>>It also prohibits non-U.S.<<<DELIMITER>>>I'm having difficulty selling software and technical collaborations with Chinese right now.<<<DELIMITER>>>China's AI business relies on integrated circuits designed by Nvidia, an American tech company.<<<DELIMITER>>>Even if the tensions ease, the company acknowledges that other challenges will still remain in the pipeline for advanced semiconductors.<<<DELIMITER>>>The Covid-19 lockdowns could disrupt semiconductor manufacturing, lead to climate-related events like Taiwan's drought and Malaysia's flood, and put the global supply of chips at risk. . 
That's why it's critical to maintain a robust supply chain.<<<DELIMITER>>>and South Korea and China that aim to build their own high-end semiconductor models.<<<DELIMITER>>>Making electronic circuits the size of one-digit nanometers is technically and financially challenging. While many nations are working to make such progress, I'm not sure any of us will. China has been implementing a 2014 plan to become a superpower in semiconductors.<<<DELIMITER>>>But the ultimate goal is to build a domestic market for semiconductors, with the U.S.<<<DELIMITER>>>Sanctions on semiconductor design and manufacturing equipment pose a direct risk. The U.S.<<<DELIMITER>>>government recently passed the CHIPS and Science Act.<<<DELIMITER>>>semiconductor suppliers based primarily in the U.S.<<<DELIMITER>>>would benefit from tax incentives to build factories in the U.S.<<<DELIMITER>>>In addition to providing a tax incentive program, the state will invest in research and development. Intel, which makes chips but keeps up with manufacturing builders, is building at two sites in central Ohio valued at $20 billion. Behind the news: A variety of foreign powerhouses are establishing a presence in the U.S.<<<DELIMITER>>>Taiwan Semiconductor Manufacturing Company, a maker of semiconductors, is building a new facility in Arizona at a cost of $12 billion and expected to open in 2024.<<<DELIMITER>>>Samsung plans to make a $17 billion manufacturing investment in Texas, but the threat of a chip shortage calls for contingency planning by the AI community. What if researchers build solutions using older semiconductors from small data, data-centric approaches, and high-performance model architecture? Optimization.<<<DELIMITER>>>It will also foster international cooperation to meet the growing challenges of global division and polarization.<<<DELIMITER>>>Who would miss semiconductors if they were to become one of the least terrifying effects of a great-power war? 
You want to build and deploy machine learning applications? Branching out of the Notebook: ML Application Development with GitHub Workshop will take place on November 9, 2022 and teaches you how to apply immediately. Register now. AI is helping hiring applications, and some companies are already using it to screen and interview job applicants.<<<DELIMITER>>>When companies hire people using AI, what does an automated system do? Fear that it will do bad work on every step of hiring.<<<DELIMITER>>>The models are trained on data that contains social biases, which in turn leads to flagrant discrimination in career advances and selection.<<<DELIMITER>>>The path to success is hampered by an apparently random mechanism that keeps automating the path to success.<<<DELIMITER>>>Minority candidates can struggle to navigate the conventional hiring process effectively. . How can automated systems perpetuate bias and errors in the recruiting process? A 2021 study by Accenture and Harvard found that a vast majority of employers in three countries (the U.S., UK, and Germany) use these systems to evaluate candidates. . Many research finds that these hiring algorithms are flawed. For example, MIT Technology Review, which evaluates candidates speaking in English, gave MyInterview and Curious Thing markings for speaking<<<DELIMITER>>>Users of the program include the German automaker BMW and the European airline group Lufthansa. Video interviewing tool HireVue predicts job seekers based on their faces and mannerisms.<<<DELIMITER>>>It discontinued that capability after a member of the scientific advisory board resigned in protest. Automated hiring systems face significant scrutiny, from legislators and companies that use them, over their performance. New York City is expected to mandate by 2023 that employers notify job applicants if they use hiring algorithms and provide non-automated alternatives when they can. 
Audits will also be held annually to ensure fairness.<<<DELIMITER>>>In 2020, Illinois enacted a similar law. Currently, the European Union's AI Act requires that hiring software be under the strict supervision of humans.<<<DELIMITER>>>Developers who want to market their systems in the European Union must submit a risk assessment and provide evidence that neither the system nor the training data involved are biased in an unacceptable way.<<<DELIMITER>>>UK legislators are contemplating similar legislation. The Data and Trust Alliance, a nonprofit that works to combat technology-induced bias in the workplace, developed tools to assess the fairness of hiring algorithms.<<<DELIMITER>>>22 large companies, including IBM, Meta, and Walmart, took the decision.Although recruiting algorithms are becoming ever more frequent, it's not unusual for companies to have humans oversee the selection process.<<<DELIMITER>>>Why is this integration successful? Because machine learning can sift through hundreds of résumés, human managers will be better positioned to spot desirable people who aren't represented in an algorithm.<<<DELIMITER>>>Humans and machines have subtle but mutually beneficial abilities that we can use to scale faster and more efficiently. . Further, machine learning models have led to applications that our foresight might surprise even more.<<<DELIMITER>>>Are they reprehensible acts? The question is: Are foundation models good at interpreting human speech.<<<DELIMITER>>>They've also demonstrated a high degree of skill in understanding biological and chemical language structures.<<<DELIMITER>>>It will be only a matter of time before someone tries to use the technology to create nefarious weapons of mass destruction. . What's new: Researchers showed that an existing AI system can be used to create chemicals that cause catastrophic harm. . 
In March, scientists at Collaborations Pharmaceuticals modified a drug discovery model based on a dataset of toxic molecules that it had designed to classify pharmaceutical candidates based on their toxicity to humans.<<<DELIMITER>>>They upgraded the rankings to give the most deadly chemicals. . Within six hours, the model generated 40,000 toxins including chemicals known to exist outside its training data. . The authors contend that their approach might be easy to replicate using open source frameworks and toxicity databases. Personal protective equipment: A researcher told me in an interview that developers of general-purpose models, like the one she uses here, might consider limiting access to mitigate misuse.<<<DELIMITER>>>It's best that the machine learning community sets standards for teaching chemistry to emerging scientists who are aware of the risk of misuse of research results. Despite these worries, a sensible path seems to me: It's critical to carefully assess the harm that new models bring, and limit the harm that is considered.<<<DELIMITER>>>Such a program would face significant opposition from scientists, who would argue that such restrictions threaten the integrity of their work, as well as entrepreneurs who favor free-lance economic development. It also may not sufficiently reduce the threat it presented to them (especially not what we had anticipated when the model was developed).<<<DELIMITER>>>The European Union just started regulating general-purpose AI.<<<DELIMITER>>>But unless a global consensus on how to define dangerous technologies, nations will ignore them.<<<DELIMITER>>>If we can reduce the risks, why not monitor vulnerabilities consistently and ensure they aren't used for evil purposes? 
I'm worried about the new remote administrative assistant, who looks overly hard at her job.<<<DELIMITER>>>Virtual workers, or bots, are likely to integrate more deeply into the distribution of tasks.<<<DELIMITER>>>The similarity between the artificial robots and humans has become more striking thanks to the invention of computer-generated personality types.<<<DELIMITER>>>Managers will become increasingly uninterested in the matter of human-to-digital assistants, leaving workers around automated co-workers who are on the air around the clock, without error, and declining social invitations. . There have been many programs like Clara, which uses email-based scheduling, and more complex digital entities that look and document like human workers. . WorkFusion offers virtual teammates in six roles including customer service coordinator, insurance underwriter, and transaction screening analyst.<<<DELIMITER>>>Each digital employee is represented by a virtual performer. . Synthesia uses generative adversarial networks to produce realistic videos of people speaking from scripted lines in over a dozen languages.<<<DELIMITER>>>Customers can use the service to create training and sales videos without a human operator. LIA (the LinkedIn Lead Generation Assistant) and Renova Digital offer avatars that help salespeople get multiple deals.<<<DELIMITER>>>Stanford University researchers found that more than 1,000 LinkedIn profiles many of them marketers were generated using face portraits, making them more likely to be fake personas.This creates a bigger threat than a proliferation of offices in unrealistic or unsettling locations.<<<DELIMITER>>>In 2020, scammers used voice recognition to fool a company executive and enabled a Hong Kong bank to transfer $35 million.<<<DELIMITER>>>Con artists using a similar scam stole £243,000 from a major UK energy company in 2019. 
Confronting fears: I'm thinking: It's hard to work with overly optimistic and highly productive colleagues who may let down peers.<<<DELIMITER>>>Then, if you want to add a touch of relevance to such beings, why not include the occasional latency in your protocols? . Stay updated with weekly AI News and Insights delivered to your inbox. Courses continue to expand the repertoire of knowledge, and The Batch is a wonderful place to talk and collaborate. Careers in AI are also available.<<<DELIMITER>>>Last week, I explained that one difference between traditional software and AI software is the challenge of specifying how technically feasible each one is.<<<DELIMITER>>>Here are some tips to deciding if you can build an AI system.<<<DELIMITER>>>But before you spend money to build a full product, it is worthwhile to test technical feasibility (if you don't have enough data, or if you have a small number of examples (that's large enough to understand the problem but not large enough to train an algorithm), consider the following steps. For unstructured data problems like images, audio, and text, training an algorithm on this data, if it's hard for a human to figure out how to get the job done, will be hard for AI to do. A review of existing literature or an analysis of similar<<<DELIMITER>>>At proof of concept, we routinely fetch training and test data from a single population.<<<DELIMITER>>>For example, if your system doesn't perform well on training data, it could be a sign that input X doesn't provide enough data to predict Y.<<<DELIMITER>>>If we can't improve or improve some input features marked x, then the problem will be very hard to solve. 
On the other hand, even if our system performs well on initial training data but performs poorly on test cases we haven't seen before, there's still room to improve.<<<DELIMITER>>>By benchmarking human-level performance, we can determine whether a system is feasible, but whether or not a model performs well on a test set will still be hard to define in terms of generalization to real-world data. If a product (such as a medical records system used by various hospitals) serves multiple customers, then data from a handful of customers can shed light on technical feasibility. Given the increased technical risk of building AI products, AI Fund (a sister company of Deeplearning.AI, which helps startups) should focus on assessing a<<<DELIMITER>>>Better technical teams are likely to overcome technical issues in a business context.<<<DELIMITER>>>Over the weekend, my son and I spent quietly discussing his math math for Father's Day dinner.<<<DELIMITER>>>Then, to help her learn, I created custom practice exercises using OpenDevin, an open source platform that generates coding challenges, and used it to write a Python script that generated thoughtful, interactive questions suited to her level of interest and skill so she could keep up.<<<DELIMITER>>>This OpenDevin superhuman computer programmer was able to code faster than I could do it myself! Thus, he helped me technically and positively affected my daughter and me. Only six months ago, coding agents really started to appear.<<<DELIMITER>>>Instead, they've made their performance more reliable and helpful for more people. . A typical workflow for a coding agent is:. 
Use an LLM to dissect the problem, break it down if necessary into sub-components, generate code based on those components, test the code, and iteratively improve the solution using error data to iterate over it.<<<DELIMITER>>>In this dimension, I see a lot of ideas to explore and build.<<<DELIMITER>>>I'd like to draw attention to important papers like AgentCoder: Multiagent-Code Generation with Iterative Testing and Optimization, by Huang et al.<<<DELIMITER>>>In 2024, researchers Zhong et al., Yang et al. conducted groundbreaking work in software engineering. . Zhong's team presented LDB: A Large Language Model Debugger via Verifying Runtime Execution Step By Step, and Yang's paper SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering.<<<DELIMITER>>>How can we test code without forcing code users to build test cases?We're thinking: The system we're working on is made of multiple agents each represented by an LLM that plays a role in simulating various parts of the multi-agent architecture.<<<DELIMITER>>>So far, AgentCoder has found that splitting code from a single agent results in better performance than splitting these functions together.<<<DELIMITER>>>This is because, if you write the code and the test data, it unduly interferes with the test set, causing the test set to ignore edge cases that aren't well documented and making sure the output is correct for a particular test set. 
Often, when people think of testing software, the first thing they think about is output validation (which is to make sure the program generates the correct output for a particular test set).<<<DELIMITER>>>If a test fails, an LLM might be prompted to examine why it failed and try to fix the problem by modifying the code.<<<DELIMITER>>>The LDB also acts as a validation tool to validate the output.<<<DELIMITER>>>You can learn by debugging (LDB) by examining the code step by step as it leaves the error, and at each step feed the LLM a value from a variable to determine if it can determine where the error is located.<<<DELIMITER>>>This is like debugging a traditional programmer to try to figure out when a complex calculation went wrong and isolate it to make it right. Many automated workflows try to mimic real-world developers.<<<DELIMITER>>>Like many research projects in AI, trying to mimic humans is an efficient way to speed up development.If the task is easy for humans to complete, then mimicking their behavior is more cost effective and defensible than starting a new application.<<<DELIMITER>>>But the developers of SWE-agent found that many tools used by human programmers are inefficient for AI.<<<DELIMITER>>>For example, allowing an agent unfettered access to a Bash shell and allowing it to search for that particular piece of code by sequentially executing `cd`, `ls`, and `cat` commands is ineffficient, even if humans can perform the job quickly.<<<DELIMITER>>>Indeed, many popular code editing platforms like Visual Studio Code, Emacs, and Vim make it challenging for LLMs to navigate.<<<DELIMITER>>>Because agents interact with computers in a way that's different from humans, the authors found that building specialized tools to help an agent search for, view, and edit codebases yielded better results. . 
One reason why research on coding agents has been evolving rapidly is that our tools can be evaluated automatically and reliably.<<<DELIMITER>>>Researchers have benchmarks like HumanEval, MBPP, and SWE-bench that help them benchmark how effective code-generating ideas are and how automatedly they succeed.<<<DELIMITER>>>In contrast, while a lot of research is being done on AI agents that search the web and synthesize articles, it's hard to know how effective they are unless we're getting much better at it as tools like Stanford's Yijia Shao et al. Open source STORM are getting better, but developers still struggle to get help from large language models (LLMs) like Github Copilot, which was released in 2021, and many programmers have used these LLMs to obtain prompt-based code.<<<DELIMITER>>>This rapid transition from basic to high-level code agents has allowed computers to aid programmers in coding tasks more quickly.<<<DELIMITER>>>With these tools, we can expect programming to be much more enjoyable and effective..
evaluation_original_model_text.txt ADDED
The diff for this file is too large to render. See raw diff
 
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 0,
3
+ "decoder_start_token_id": 2,
4
+ "eos_token_id": 2,
5
+ "max_length": 200,
6
+ "pad_token_id": 1,
7
+ "transformers_version": "4.49.0"
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faa2359078605bb7fd22e7c793d2a2cdb30ecd76dc36930c6f1b7e50f0854370
3
+ size 2460350816
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28d41f3a6babca7ef7c71b535d1637ea772a7534f07cb65851fffe2d494c7a1a
3
+ size 4921015253
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66260ee6256efa78d757c1be8b1cb8381906700b60f59438d5ad3ea595d46281
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:000161925d5d039b308132d09ccf3a330240232f75c2faf117e6414c7dbffbe7
3
+ size 1064
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a
3
+ size 4852054
special_tokens_map.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "and_Cas"
4
+ ],
5
+ "bos_token": {
6
+ "content": "<s>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "cls_token": {
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "eos_token": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "mask_token": {
27
+ "content": "<mask>",
28
+ "lstrip": true,
29
+ "normalized": true,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ },
33
+ "pad_token": {
34
+ "content": "<pad>",
35
+ "lstrip": false,
36
+ "normalized": false,
37
+ "rstrip": false,
38
+ "single_word": false
39
+ },
40
+ "sep_token": {
41
+ "content": "</s>",
42
+ "lstrip": false,
43
+ "normalized": false,
44
+ "rstrip": false,
45
+ "single_word": false
46
+ },
47
+ "unk_token": {
48
+ "content": "<unk>",
49
+ "lstrip": false,
50
+ "normalized": false,
51
+ "rstrip": false,
52
+ "single_word": false
53
+ }
54
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,1686 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "256001": {
36
+ "content": "ace_Arab",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "256002": {
44
+ "content": "ace_Latn",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "256003": {
52
+ "content": "acm_Arab",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "256004": {
60
+ "content": "acq_Arab",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "256005": {
68
+ "content": "aeb_Arab",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "256006": {
76
+ "content": "afr_Latn",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "256007": {
84
+ "content": "ajp_Arab",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "256008": {
92
+ "content": "aka_Latn",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "256009": {
100
+ "content": "amh_Ethi",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "256010": {
108
+ "content": "apc_Arab",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "256011": {
116
+ "content": "arb_Arab",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "256012": {
124
+ "content": "ars_Arab",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "256013": {
132
+ "content": "ary_Arab",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "256014": {
140
+ "content": "arz_Arab",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "256015": {
148
+ "content": "asm_Beng",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "256016": {
156
+ "content": "ast_Latn",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "256017": {
164
+ "content": "awa_Deva",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "256018": {
172
+ "content": "ayr_Latn",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "256019": {
180
+ "content": "azb_Arab",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "256020": {
188
+ "content": "azj_Latn",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "256021": {
196
+ "content": "bak_Cyrl",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "256022": {
204
+ "content": "bam_Latn",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "256023": {
212
+ "content": "ban_Latn",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "256024": {
220
+ "content": "bel_Cyrl",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "256025": {
228
+ "content": "bem_Latn",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": true
234
+ },
235
+ "256026": {
236
+ "content": "ben_Beng",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": true
242
+ },
243
+ "256027": {
244
+ "content": "bho_Deva",
245
+ "lstrip": false,
246
+ "normalized": false,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": true
250
+ },
251
+ "256028": {
252
+ "content": "bjn_Arab",
253
+ "lstrip": false,
254
+ "normalized": false,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": true
258
+ },
259
+ "256029": {
260
+ "content": "bjn_Latn",
261
+ "lstrip": false,
262
+ "normalized": false,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": true
266
+ },
267
+ "256030": {
268
+ "content": "bod_Tibt",
269
+ "lstrip": false,
270
+ "normalized": false,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": true
274
+ },
275
+ "256031": {
276
+ "content": "bos_Latn",
277
+ "lstrip": false,
278
+ "normalized": false,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": true
282
+ },
283
+ "256032": {
284
+ "content": "bug_Latn",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": true
290
+ },
291
+ "256033": {
292
+ "content": "bul_Cyrl",
293
+ "lstrip": false,
294
+ "normalized": false,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": true
298
+ },
299
+ "256034": {
300
+ "content": "cat_Latn",
301
+ "lstrip": false,
302
+ "normalized": false,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": true
306
+ },
307
+ "256035": {
308
+ "content": "ceb_Latn",
309
+ "lstrip": false,
310
+ "normalized": false,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": true
314
+ },
315
+ "256036": {
316
+ "content": "ces_Latn",
317
+ "lstrip": false,
318
+ "normalized": false,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": true
322
+ },
323
+ "256037": {
324
+ "content": "cjk_Latn",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": true
330
+ },
331
+ "256038": {
332
+ "content": "ckb_Arab",
333
+ "lstrip": false,
334
+ "normalized": false,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": true
338
+ },
339
+ "256039": {
340
+ "content": "crh_Latn",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": true
346
+ },
347
+ "256040": {
348
+ "content": "cym_Latn",
349
+ "lstrip": false,
350
+ "normalized": false,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": true
354
+ },
355
+ "256041": {
356
+ "content": "dan_Latn",
357
+ "lstrip": false,
358
+ "normalized": false,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": true
362
+ },
363
+ "256042": {
364
+ "content": "deu_Latn",
365
+ "lstrip": false,
366
+ "normalized": false,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": true
370
+ },
371
+ "256043": {
372
+ "content": "dik_Latn",
373
+ "lstrip": false,
374
+ "normalized": false,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": true
378
+ },
379
+ "256044": {
380
+ "content": "dyu_Latn",
381
+ "lstrip": false,
382
+ "normalized": false,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": true
386
+ },
387
+ "256045": {
388
+ "content": "dzo_Tibt",
389
+ "lstrip": false,
390
+ "normalized": false,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": true
394
+ },
395
+ "256046": {
396
+ "content": "ell_Grek",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": true
402
+ },
403
+ "256047": {
404
+ "content": "eng_Latn",
405
+ "lstrip": false,
406
+ "normalized": false,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": true
410
+ },
411
+ "256048": {
412
+ "content": "epo_Latn",
413
+ "lstrip": false,
414
+ "normalized": false,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": true
418
+ },
419
+ "256049": {
420
+ "content": "est_Latn",
421
+ "lstrip": false,
422
+ "normalized": false,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": true
426
+ },
427
+ "256050": {
428
+ "content": "eus_Latn",
429
+ "lstrip": false,
430
+ "normalized": false,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": true
434
+ },
435
+ "256051": {
436
+ "content": "ewe_Latn",
437
+ "lstrip": false,
438
+ "normalized": false,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": true
442
+ },
443
+ "256052": {
444
+ "content": "fao_Latn",
445
+ "lstrip": false,
446
+ "normalized": false,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": true
450
+ },
451
+ "256053": {
452
+ "content": "pes_Arab",
453
+ "lstrip": false,
454
+ "normalized": false,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": true
458
+ },
459
+ "256054": {
460
+ "content": "fij_Latn",
461
+ "lstrip": false,
462
+ "normalized": false,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": true
466
+ },
467
+ "256055": {
468
+ "content": "fin_Latn",
469
+ "lstrip": false,
470
+ "normalized": false,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": true
474
+ },
475
+ "256056": {
476
+ "content": "fon_Latn",
477
+ "lstrip": false,
478
+ "normalized": false,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": true
482
+ },
483
+ "256057": {
484
+ "content": "fra_Latn",
485
+ "lstrip": false,
486
+ "normalized": false,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": true
490
+ },
491
+ "256058": {
492
+ "content": "fur_Latn",
493
+ "lstrip": false,
494
+ "normalized": false,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": true
498
+ },
499
+ "256059": {
500
+ "content": "fuv_Latn",
501
+ "lstrip": false,
502
+ "normalized": false,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": true
506
+ },
507
+ "256060": {
508
+ "content": "gla_Latn",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": true
514
+ },
515
+ "256061": {
516
+ "content": "gle_Latn",
517
+ "lstrip": false,
518
+ "normalized": false,
519
+ "rstrip": false,
520
+ "single_word": false,
521
+ "special": true
522
+ },
523
+ "256062": {
524
+ "content": "glg_Latn",
525
+ "lstrip": false,
526
+ "normalized": false,
527
+ "rstrip": false,
528
+ "single_word": false,
529
+ "special": true
530
+ },
531
+ "256063": {
532
+ "content": "grn_Latn",
533
+ "lstrip": false,
534
+ "normalized": false,
535
+ "rstrip": false,
536
+ "single_word": false,
537
+ "special": true
538
+ },
539
+ "256064": {
540
+ "content": "guj_Gujr",
541
+ "lstrip": false,
542
+ "normalized": false,
543
+ "rstrip": false,
544
+ "single_word": false,
545
+ "special": true
546
+ },
547
+ "256065": {
548
+ "content": "hat_Latn",
549
+ "lstrip": false,
550
+ "normalized": false,
551
+ "rstrip": false,
552
+ "single_word": false,
553
+ "special": true
554
+ },
555
+ "256066": {
556
+ "content": "hau_Latn",
557
+ "lstrip": false,
558
+ "normalized": false,
559
+ "rstrip": false,
560
+ "single_word": false,
561
+ "special": true
562
+ },
563
+ "256067": {
564
+ "content": "heb_Hebr",
565
+ "lstrip": false,
566
+ "normalized": false,
567
+ "rstrip": false,
568
+ "single_word": false,
569
+ "special": true
570
+ },
571
+ "256068": {
572
+ "content": "hin_Deva",
573
+ "lstrip": false,
574
+ "normalized": false,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": true
578
+ },
579
+ "256069": {
580
+ "content": "hne_Deva",
581
+ "lstrip": false,
582
+ "normalized": false,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": true
586
+ },
587
+ "256070": {
588
+ "content": "hrv_Latn",
589
+ "lstrip": false,
590
+ "normalized": false,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": true
594
+ },
595
+ "256071": {
596
+ "content": "hun_Latn",
597
+ "lstrip": false,
598
+ "normalized": false,
599
+ "rstrip": false,
600
+ "single_word": false,
601
+ "special": true
602
+ },
603
+ "256072": {
604
+ "content": "hye_Armn",
605
+ "lstrip": false,
606
+ "normalized": false,
607
+ "rstrip": false,
608
+ "single_word": false,
609
+ "special": true
610
+ },
611
+ "256073": {
612
+ "content": "ibo_Latn",
613
+ "lstrip": false,
614
+ "normalized": false,
615
+ "rstrip": false,
616
+ "single_word": false,
617
+ "special": true
618
+ },
619
+ "256074": {
620
+ "content": "ilo_Latn",
621
+ "lstrip": false,
622
+ "normalized": false,
623
+ "rstrip": false,
624
+ "single_word": false,
625
+ "special": true
626
+ },
627
+ "256075": {
628
+ "content": "ind_Latn",
629
+ "lstrip": false,
630
+ "normalized": false,
631
+ "rstrip": false,
632
+ "single_word": false,
633
+ "special": true
634
+ },
635
+ "256076": {
636
+ "content": "isl_Latn",
637
+ "lstrip": false,
638
+ "normalized": false,
639
+ "rstrip": false,
640
+ "single_word": false,
641
+ "special": true
642
+ },
643
+ "256077": {
644
+ "content": "ita_Latn",
645
+ "lstrip": false,
646
+ "normalized": false,
647
+ "rstrip": false,
648
+ "single_word": false,
649
+ "special": true
650
+ },
651
+ "256078": {
652
+ "content": "jav_Latn",
653
+ "lstrip": false,
654
+ "normalized": false,
655
+ "rstrip": false,
656
+ "single_word": false,
657
+ "special": true
658
+ },
659
+ "256079": {
660
+ "content": "jpn_Jpan",
661
+ "lstrip": false,
662
+ "normalized": false,
663
+ "rstrip": false,
664
+ "single_word": false,
665
+ "special": true
666
+ },
667
+ "256080": {
668
+ "content": "kab_Latn",
669
+ "lstrip": false,
670
+ "normalized": false,
671
+ "rstrip": false,
672
+ "single_word": false,
673
+ "special": true
674
+ },
675
+ "256081": {
676
+ "content": "kac_Latn",
677
+ "lstrip": false,
678
+ "normalized": false,
679
+ "rstrip": false,
680
+ "single_word": false,
681
+ "special": true
682
+ },
683
+ "256082": {
684
+ "content": "kam_Latn",
685
+ "lstrip": false,
686
+ "normalized": false,
687
+ "rstrip": false,
688
+ "single_word": false,
689
+ "special": true
690
+ },
691
+ "256083": {
692
+ "content": "kan_Knda",
693
+ "lstrip": false,
694
+ "normalized": false,
695
+ "rstrip": false,
696
+ "single_word": false,
697
+ "special": true
698
+ },
699
+ "256084": {
700
+ "content": "kas_Arab",
701
+ "lstrip": false,
702
+ "normalized": false,
703
+ "rstrip": false,
704
+ "single_word": false,
705
+ "special": true
706
+ },
707
+ "256085": {
708
+ "content": "kas_Deva",
709
+ "lstrip": false,
710
+ "normalized": false,
711
+ "rstrip": false,
712
+ "single_word": false,
713
+ "special": true
714
+ },
715
+ "256086": {
716
+ "content": "kat_Geor",
717
+ "lstrip": false,
718
+ "normalized": false,
719
+ "rstrip": false,
720
+ "single_word": false,
721
+ "special": true
722
+ },
723
+ "256087": {
724
+ "content": "knc_Arab",
725
+ "lstrip": false,
726
+ "normalized": false,
727
+ "rstrip": false,
728
+ "single_word": false,
729
+ "special": true
730
+ },
731
+ "256088": {
732
+ "content": "knc_Latn",
733
+ "lstrip": false,
734
+ "normalized": false,
735
+ "rstrip": false,
736
+ "single_word": false,
737
+ "special": true
738
+ },
739
+ "256089": {
740
+ "content": "kaz_Cyrl",
741
+ "lstrip": false,
742
+ "normalized": false,
743
+ "rstrip": false,
744
+ "single_word": false,
745
+ "special": true
746
+ },
747
+ "256090": {
748
+ "content": "kbp_Latn",
749
+ "lstrip": false,
750
+ "normalized": false,
751
+ "rstrip": false,
752
+ "single_word": false,
753
+ "special": true
754
+ },
755
+ "256091": {
756
+ "content": "kea_Latn",
757
+ "lstrip": false,
758
+ "normalized": false,
759
+ "rstrip": false,
760
+ "single_word": false,
761
+ "special": true
762
+ },
763
+ "256092": {
764
+ "content": "khm_Khmr",
765
+ "lstrip": false,
766
+ "normalized": false,
767
+ "rstrip": false,
768
+ "single_word": false,
769
+ "special": true
770
+ },
771
+ "256093": {
772
+ "content": "kik_Latn",
773
+ "lstrip": false,
774
+ "normalized": false,
775
+ "rstrip": false,
776
+ "single_word": false,
777
+ "special": true
778
+ },
779
+ "256094": {
780
+ "content": "kin_Latn",
781
+ "lstrip": false,
782
+ "normalized": false,
783
+ "rstrip": false,
784
+ "single_word": false,
785
+ "special": true
786
+ },
787
+ "256095": {
788
+ "content": "kir_Cyrl",
789
+ "lstrip": false,
790
+ "normalized": false,
791
+ "rstrip": false,
792
+ "single_word": false,
793
+ "special": true
794
+ },
795
+ "256096": {
796
+ "content": "kmb_Latn",
797
+ "lstrip": false,
798
+ "normalized": false,
799
+ "rstrip": false,
800
+ "single_word": false,
801
+ "special": true
802
+ },
803
+ "256097": {
804
+ "content": "kon_Latn",
805
+ "lstrip": false,
806
+ "normalized": false,
807
+ "rstrip": false,
808
+ "single_word": false,
809
+ "special": true
810
+ },
811
+ "256098": {
812
+ "content": "kor_Hang",
813
+ "lstrip": false,
814
+ "normalized": false,
815
+ "rstrip": false,
816
+ "single_word": false,
817
+ "special": true
818
+ },
819
+ "256099": {
820
+ "content": "kmr_Latn",
821
+ "lstrip": false,
822
+ "normalized": false,
823
+ "rstrip": false,
824
+ "single_word": false,
825
+ "special": true
826
+ },
827
+ "256100": {
828
+ "content": "lao_Laoo",
829
+ "lstrip": false,
830
+ "normalized": false,
831
+ "rstrip": false,
832
+ "single_word": false,
833
+ "special": true
834
+ },
835
+ "256101": {
836
+ "content": "lvs_Latn",
837
+ "lstrip": false,
838
+ "normalized": false,
839
+ "rstrip": false,
840
+ "single_word": false,
841
+ "special": true
842
+ },
843
+ "256102": {
844
+ "content": "lij_Latn",
845
+ "lstrip": false,
846
+ "normalized": false,
847
+ "rstrip": false,
848
+ "single_word": false,
849
+ "special": true
850
+ },
851
+ "256103": {
852
+ "content": "lim_Latn",
853
+ "lstrip": false,
854
+ "normalized": false,
855
+ "rstrip": false,
856
+ "single_word": false,
857
+ "special": true
858
+ },
859
+ "256104": {
860
+ "content": "lin_Latn",
861
+ "lstrip": false,
862
+ "normalized": false,
863
+ "rstrip": false,
864
+ "single_word": false,
865
+ "special": true
866
+ },
867
+ "256105": {
868
+ "content": "lit_Latn",
869
+ "lstrip": false,
870
+ "normalized": false,
871
+ "rstrip": false,
872
+ "single_word": false,
873
+ "special": true
874
+ },
875
+ "256106": {
876
+ "content": "lmo_Latn",
877
+ "lstrip": false,
878
+ "normalized": false,
879
+ "rstrip": false,
880
+ "single_word": false,
881
+ "special": true
882
+ },
883
+ "256107": {
884
+ "content": "ltg_Latn",
885
+ "lstrip": false,
886
+ "normalized": false,
887
+ "rstrip": false,
888
+ "single_word": false,
889
+ "special": true
890
+ },
891
+ "256108": {
892
+ "content": "ltz_Latn",
893
+ "lstrip": false,
894
+ "normalized": false,
895
+ "rstrip": false,
896
+ "single_word": false,
897
+ "special": true
898
+ },
899
+ "256109": {
900
+ "content": "lua_Latn",
901
+ "lstrip": false,
902
+ "normalized": false,
903
+ "rstrip": false,
904
+ "single_word": false,
905
+ "special": true
906
+ },
907
+ "256110": {
908
+ "content": "lug_Latn",
909
+ "lstrip": false,
910
+ "normalized": false,
911
+ "rstrip": false,
912
+ "single_word": false,
913
+ "special": true
914
+ },
915
+ "256111": {
916
+ "content": "luo_Latn",
917
+ "lstrip": false,
918
+ "normalized": false,
919
+ "rstrip": false,
920
+ "single_word": false,
921
+ "special": true
922
+ },
923
+ "256112": {
924
+ "content": "lus_Latn",
925
+ "lstrip": false,
926
+ "normalized": false,
927
+ "rstrip": false,
928
+ "single_word": false,
929
+ "special": true
930
+ },
931
+ "256113": {
932
+ "content": "mag_Deva",
933
+ "lstrip": false,
934
+ "normalized": false,
935
+ "rstrip": false,
936
+ "single_word": false,
937
+ "special": true
938
+ },
939
+ "256114": {
940
+ "content": "mai_Deva",
941
+ "lstrip": false,
942
+ "normalized": false,
943
+ "rstrip": false,
944
+ "single_word": false,
945
+ "special": true
946
+ },
947
+ "256115": {
948
+ "content": "mal_Mlym",
949
+ "lstrip": false,
950
+ "normalized": false,
951
+ "rstrip": false,
952
+ "single_word": false,
953
+ "special": true
954
+ },
955
+ "256116": {
956
+ "content": "mar_Deva",
957
+ "lstrip": false,
958
+ "normalized": false,
959
+ "rstrip": false,
960
+ "single_word": false,
961
+ "special": true
962
+ },
963
+ "256117": {
964
+ "content": "min_Latn",
965
+ "lstrip": false,
966
+ "normalized": false,
967
+ "rstrip": false,
968
+ "single_word": false,
969
+ "special": true
970
+ },
971
+ "256118": {
972
+ "content": "mkd_Cyrl",
973
+ "lstrip": false,
974
+ "normalized": false,
975
+ "rstrip": false,
976
+ "single_word": false,
977
+ "special": true
978
+ },
979
+ "256119": {
980
+ "content": "plt_Latn",
981
+ "lstrip": false,
982
+ "normalized": false,
983
+ "rstrip": false,
984
+ "single_word": false,
985
+ "special": true
986
+ },
987
+ "256120": {
988
+ "content": "mlt_Latn",
989
+ "lstrip": false,
990
+ "normalized": false,
991
+ "rstrip": false,
992
+ "single_word": false,
993
+ "special": true
994
+ },
995
+ "256121": {
996
+ "content": "mni_Beng",
997
+ "lstrip": false,
998
+ "normalized": false,
999
+ "rstrip": false,
1000
+ "single_word": false,
1001
+ "special": true
1002
+ },
1003
+ "256122": {
1004
+ "content": "khk_Cyrl",
1005
+ "lstrip": false,
1006
+ "normalized": false,
1007
+ "rstrip": false,
1008
+ "single_word": false,
1009
+ "special": true
1010
+ },
1011
+ "256123": {
1012
+ "content": "mos_Latn",
1013
+ "lstrip": false,
1014
+ "normalized": false,
1015
+ "rstrip": false,
1016
+ "single_word": false,
1017
+ "special": true
1018
+ },
1019
+ "256124": {
1020
+ "content": "mri_Latn",
1021
+ "lstrip": false,
1022
+ "normalized": false,
1023
+ "rstrip": false,
1024
+ "single_word": false,
1025
+ "special": true
1026
+ },
1027
+ "256125": {
1028
+ "content": "zsm_Latn",
1029
+ "lstrip": false,
1030
+ "normalized": false,
1031
+ "rstrip": false,
1032
+ "single_word": false,
1033
+ "special": true
1034
+ },
1035
+ "256126": {
1036
+ "content": "mya_Mymr",
1037
+ "lstrip": false,
1038
+ "normalized": false,
1039
+ "rstrip": false,
1040
+ "single_word": false,
1041
+ "special": true
1042
+ },
1043
+ "256127": {
1044
+ "content": "nld_Latn",
1045
+ "lstrip": false,
1046
+ "normalized": false,
1047
+ "rstrip": false,
1048
+ "single_word": false,
1049
+ "special": true
1050
+ },
1051
+ "256128": {
1052
+ "content": "nno_Latn",
1053
+ "lstrip": false,
1054
+ "normalized": false,
1055
+ "rstrip": false,
1056
+ "single_word": false,
1057
+ "special": true
1058
+ },
1059
+ "256129": {
1060
+ "content": "nob_Latn",
1061
+ "lstrip": false,
1062
+ "normalized": false,
1063
+ "rstrip": false,
1064
+ "single_word": false,
1065
+ "special": true
1066
+ },
1067
+ "256130": {
1068
+ "content": "npi_Deva",
1069
+ "lstrip": false,
1070
+ "normalized": false,
1071
+ "rstrip": false,
1072
+ "single_word": false,
1073
+ "special": true
1074
+ },
1075
+ "256131": {
1076
+ "content": "nso_Latn",
1077
+ "lstrip": false,
1078
+ "normalized": false,
1079
+ "rstrip": false,
1080
+ "single_word": false,
1081
+ "special": true
1082
+ },
1083
+ "256132": {
1084
+ "content": "nus_Latn",
1085
+ "lstrip": false,
1086
+ "normalized": false,
1087
+ "rstrip": false,
1088
+ "single_word": false,
1089
+ "special": true
1090
+ },
1091
+ "256133": {
1092
+ "content": "nya_Latn",
1093
+ "lstrip": false,
1094
+ "normalized": false,
1095
+ "rstrip": false,
1096
+ "single_word": false,
1097
+ "special": true
1098
+ },
1099
+ "256134": {
1100
+ "content": "oci_Latn",
1101
+ "lstrip": false,
1102
+ "normalized": false,
1103
+ "rstrip": false,
1104
+ "single_word": false,
1105
+ "special": true
1106
+ },
1107
+ "256135": {
1108
+ "content": "gaz_Latn",
1109
+ "lstrip": false,
1110
+ "normalized": false,
1111
+ "rstrip": false,
1112
+ "single_word": false,
1113
+ "special": true
1114
+ },
1115
+ "256136": {
1116
+ "content": "ory_Orya",
1117
+ "lstrip": false,
1118
+ "normalized": false,
1119
+ "rstrip": false,
1120
+ "single_word": false,
1121
+ "special": true
1122
+ },
1123
+ "256137": {
1124
+ "content": "pag_Latn",
1125
+ "lstrip": false,
1126
+ "normalized": false,
1127
+ "rstrip": false,
1128
+ "single_word": false,
1129
+ "special": true
1130
+ },
1131
+ "256138": {
1132
+ "content": "pan_Guru",
1133
+ "lstrip": false,
1134
+ "normalized": false,
1135
+ "rstrip": false,
1136
+ "single_word": false,
1137
+ "special": true
1138
+ },
1139
+ "256139": {
1140
+ "content": "pap_Latn",
1141
+ "lstrip": false,
1142
+ "normalized": false,
1143
+ "rstrip": false,
1144
+ "single_word": false,
1145
+ "special": true
1146
+ },
1147
+ "256140": {
1148
+ "content": "pol_Latn",
1149
+ "lstrip": false,
1150
+ "normalized": false,
1151
+ "rstrip": false,
1152
+ "single_word": false,
1153
+ "special": true
1154
+ },
1155
+ "256141": {
1156
+ "content": "por_Latn",
1157
+ "lstrip": false,
1158
+ "normalized": false,
1159
+ "rstrip": false,
1160
+ "single_word": false,
1161
+ "special": true
1162
+ },
1163
+ "256142": {
1164
+ "content": "prs_Arab",
1165
+ "lstrip": false,
1166
+ "normalized": false,
1167
+ "rstrip": false,
1168
+ "single_word": false,
1169
+ "special": true
1170
+ },
1171
+ "256143": {
1172
+ "content": "pbt_Arab",
1173
+ "lstrip": false,
1174
+ "normalized": false,
1175
+ "rstrip": false,
1176
+ "single_word": false,
1177
+ "special": true
1178
+ },
1179
+ "256144": {
1180
+ "content": "quy_Latn",
1181
+ "lstrip": false,
1182
+ "normalized": false,
1183
+ "rstrip": false,
1184
+ "single_word": false,
1185
+ "special": true
1186
+ },
1187
+ "256145": {
1188
+ "content": "ron_Latn",
1189
+ "lstrip": false,
1190
+ "normalized": false,
1191
+ "rstrip": false,
1192
+ "single_word": false,
1193
+ "special": true
1194
+ },
1195
+ "256146": {
1196
+ "content": "run_Latn",
1197
+ "lstrip": false,
1198
+ "normalized": false,
1199
+ "rstrip": false,
1200
+ "single_word": false,
1201
+ "special": true
1202
+ },
1203
+ "256147": {
1204
+ "content": "rus_Cyrl",
1205
+ "lstrip": false,
1206
+ "normalized": false,
1207
+ "rstrip": false,
1208
+ "single_word": false,
1209
+ "special": true
1210
+ },
1211
+ "256148": {
1212
+ "content": "sag_Latn",
1213
+ "lstrip": false,
1214
+ "normalized": false,
1215
+ "rstrip": false,
1216
+ "single_word": false,
1217
+ "special": true
1218
+ },
1219
+ "256149": {
1220
+ "content": "san_Deva",
1221
+ "lstrip": false,
1222
+ "normalized": false,
1223
+ "rstrip": false,
1224
+ "single_word": false,
1225
+ "special": true
1226
+ },
1227
+ "256150": {
1228
+ "content": "sat_Beng",
1229
+ "lstrip": false,
1230
+ "normalized": false,
1231
+ "rstrip": false,
1232
+ "single_word": false,
1233
+ "special": true
1234
+ },
1235
+ "256151": {
1236
+ "content": "scn_Latn",
1237
+ "lstrip": false,
1238
+ "normalized": false,
1239
+ "rstrip": false,
1240
+ "single_word": false,
1241
+ "special": true
1242
+ },
1243
+ "256152": {
1244
+ "content": "shn_Mymr",
1245
+ "lstrip": false,
1246
+ "normalized": false,
1247
+ "rstrip": false,
1248
+ "single_word": false,
1249
+ "special": true
1250
+ },
1251
+ "256153": {
1252
+ "content": "sin_Sinh",
1253
+ "lstrip": false,
1254
+ "normalized": false,
1255
+ "rstrip": false,
1256
+ "single_word": false,
1257
+ "special": true
1258
+ },
1259
+ "256154": {
1260
+ "content": "slk_Latn",
1261
+ "lstrip": false,
1262
+ "normalized": false,
1263
+ "rstrip": false,
1264
+ "single_word": false,
1265
+ "special": true
1266
+ },
1267
+ "256155": {
1268
+ "content": "slv_Latn",
1269
+ "lstrip": false,
1270
+ "normalized": false,
1271
+ "rstrip": false,
1272
+ "single_word": false,
1273
+ "special": true
1274
+ },
1275
+ "256156": {
1276
+ "content": "smo_Latn",
1277
+ "lstrip": false,
1278
+ "normalized": false,
1279
+ "rstrip": false,
1280
+ "single_word": false,
1281
+ "special": true
1282
+ },
1283
+ "256157": {
1284
+ "content": "sna_Latn",
1285
+ "lstrip": false,
1286
+ "normalized": false,
1287
+ "rstrip": false,
1288
+ "single_word": false,
1289
+ "special": true
1290
+ },
1291
+ "256158": {
1292
+ "content": "snd_Arab",
1293
+ "lstrip": false,
1294
+ "normalized": false,
1295
+ "rstrip": false,
1296
+ "single_word": false,
1297
+ "special": true
1298
+ },
1299
+ "256159": {
1300
+ "content": "som_Latn",
1301
+ "lstrip": false,
1302
+ "normalized": false,
1303
+ "rstrip": false,
1304
+ "single_word": false,
1305
+ "special": true
1306
+ },
1307
+ "256160": {
1308
+ "content": "sot_Latn",
1309
+ "lstrip": false,
1310
+ "normalized": false,
1311
+ "rstrip": false,
1312
+ "single_word": false,
1313
+ "special": true
1314
+ },
1315
+ "256161": {
1316
+ "content": "spa_Latn",
1317
+ "lstrip": false,
1318
+ "normalized": false,
1319
+ "rstrip": false,
1320
+ "single_word": false,
1321
+ "special": true
1322
+ },
1323
+ "256162": {
1324
+ "content": "als_Latn",
1325
+ "lstrip": false,
1326
+ "normalized": false,
1327
+ "rstrip": false,
1328
+ "single_word": false,
1329
+ "special": true
1330
+ },
1331
+ "256163": {
1332
+ "content": "srd_Latn",
1333
+ "lstrip": false,
1334
+ "normalized": false,
1335
+ "rstrip": false,
1336
+ "single_word": false,
1337
+ "special": true
1338
+ },
1339
+ "256164": {
1340
+ "content": "srp_Cyrl",
1341
+ "lstrip": false,
1342
+ "normalized": false,
1343
+ "rstrip": false,
1344
+ "single_word": false,
1345
+ "special": true
1346
+ },
1347
+ "256165": {
1348
+ "content": "ssw_Latn",
1349
+ "lstrip": false,
1350
+ "normalized": false,
1351
+ "rstrip": false,
1352
+ "single_word": false,
1353
+ "special": true
1354
+ },
1355
+ "256166": {
1356
+ "content": "sun_Latn",
1357
+ "lstrip": false,
1358
+ "normalized": false,
1359
+ "rstrip": false,
1360
+ "single_word": false,
1361
+ "special": true
1362
+ },
1363
+ "256167": {
1364
+ "content": "swe_Latn",
1365
+ "lstrip": false,
1366
+ "normalized": false,
1367
+ "rstrip": false,
1368
+ "single_word": false,
1369
+ "special": true
1370
+ },
1371
+ "256168": {
1372
+ "content": "swh_Latn",
1373
+ "lstrip": false,
1374
+ "normalized": false,
1375
+ "rstrip": false,
1376
+ "single_word": false,
1377
+ "special": true
1378
+ },
1379
+ "256169": {
1380
+ "content": "szl_Latn",
1381
+ "lstrip": false,
1382
+ "normalized": false,
1383
+ "rstrip": false,
1384
+ "single_word": false,
1385
+ "special": true
1386
+ },
1387
+ "256170": {
1388
+ "content": "tam_Taml",
1389
+ "lstrip": false,
1390
+ "normalized": false,
1391
+ "rstrip": false,
1392
+ "single_word": false,
1393
+ "special": true
1394
+ },
1395
+ "256171": {
1396
+ "content": "tat_Cyrl",
1397
+ "lstrip": false,
1398
+ "normalized": false,
1399
+ "rstrip": false,
1400
+ "single_word": false,
1401
+ "special": true
1402
+ },
1403
+ "256172": {
1404
+ "content": "tel_Telu",
1405
+ "lstrip": false,
1406
+ "normalized": false,
1407
+ "rstrip": false,
1408
+ "single_word": false,
1409
+ "special": true
1410
+ },
1411
+ "256173": {
1412
+ "content": "tgk_Cyrl",
1413
+ "lstrip": false,
1414
+ "normalized": false,
1415
+ "rstrip": false,
1416
+ "single_word": false,
1417
+ "special": true
1418
+ },
1419
+ "256174": {
1420
+ "content": "tgl_Latn",
1421
+ "lstrip": false,
1422
+ "normalized": false,
1423
+ "rstrip": false,
1424
+ "single_word": false,
1425
+ "special": true
1426
+ },
1427
+ "256175": {
1428
+ "content": "tha_Thai",
1429
+ "lstrip": false,
1430
+ "normalized": false,
1431
+ "rstrip": false,
1432
+ "single_word": false,
1433
+ "special": true
1434
+ },
1435
+ "256176": {
1436
+ "content": "tir_Ethi",
1437
+ "lstrip": false,
1438
+ "normalized": false,
1439
+ "rstrip": false,
1440
+ "single_word": false,
1441
+ "special": true
1442
+ },
1443
+ "256177": {
1444
+ "content": "taq_Latn",
1445
+ "lstrip": false,
1446
+ "normalized": false,
1447
+ "rstrip": false,
1448
+ "single_word": false,
1449
+ "special": true
1450
+ },
1451
+ "256178": {
1452
+ "content": "taq_Tfng",
1453
+ "lstrip": false,
1454
+ "normalized": false,
1455
+ "rstrip": false,
1456
+ "single_word": false,
1457
+ "special": true
1458
+ },
1459
+ "256179": {
1460
+ "content": "tpi_Latn",
1461
+ "lstrip": false,
1462
+ "normalized": false,
1463
+ "rstrip": false,
1464
+ "single_word": false,
1465
+ "special": true
1466
+ },
1467
+ "256180": {
1468
+ "content": "tsn_Latn",
1469
+ "lstrip": false,
1470
+ "normalized": false,
1471
+ "rstrip": false,
1472
+ "single_word": false,
1473
+ "special": true
1474
+ },
1475
+ "256181": {
1476
+ "content": "tso_Latn",
1477
+ "lstrip": false,
1478
+ "normalized": false,
1479
+ "rstrip": false,
1480
+ "single_word": false,
1481
+ "special": true
1482
+ },
1483
+ "256182": {
1484
+ "content": "tuk_Latn",
1485
+ "lstrip": false,
1486
+ "normalized": false,
1487
+ "rstrip": false,
1488
+ "single_word": false,
1489
+ "special": true
1490
+ },
1491
+ "256183": {
1492
+ "content": "tum_Latn",
1493
+ "lstrip": false,
1494
+ "normalized": false,
1495
+ "rstrip": false,
1496
+ "single_word": false,
1497
+ "special": true
1498
+ },
1499
+ "256184": {
1500
+ "content": "tur_Latn",
1501
+ "lstrip": false,
1502
+ "normalized": false,
1503
+ "rstrip": false,
1504
+ "single_word": false,
1505
+ "special": true
1506
+ },
1507
+ "256185": {
1508
+ "content": "twi_Latn",
1509
+ "lstrip": false,
1510
+ "normalized": false,
1511
+ "rstrip": false,
1512
+ "single_word": false,
1513
+ "special": true
1514
+ },
1515
+ "256186": {
1516
+ "content": "tzm_Tfng",
1517
+ "lstrip": false,
1518
+ "normalized": false,
1519
+ "rstrip": false,
1520
+ "single_word": false,
1521
+ "special": true
1522
+ },
1523
+ "256187": {
1524
+ "content": "uig_Arab",
1525
+ "lstrip": false,
1526
+ "normalized": false,
1527
+ "rstrip": false,
1528
+ "single_word": false,
1529
+ "special": true
1530
+ },
1531
+ "256188": {
1532
+ "content": "ukr_Cyrl",
1533
+ "lstrip": false,
1534
+ "normalized": false,
1535
+ "rstrip": false,
1536
+ "single_word": false,
1537
+ "special": true
1538
+ },
1539
+ "256189": {
1540
+ "content": "umb_Latn",
1541
+ "lstrip": false,
1542
+ "normalized": false,
1543
+ "rstrip": false,
1544
+ "single_word": false,
1545
+ "special": true
1546
+ },
1547
+ "256190": {
1548
+ "content": "urd_Arab",
1549
+ "lstrip": false,
1550
+ "normalized": false,
1551
+ "rstrip": false,
1552
+ "single_word": false,
1553
+ "special": true
1554
+ },
1555
+ "256191": {
1556
+ "content": "uzn_Latn",
1557
+ "lstrip": false,
1558
+ "normalized": false,
1559
+ "rstrip": false,
1560
+ "single_word": false,
1561
+ "special": true
1562
+ },
1563
+ "256192": {
1564
+ "content": "vec_Latn",
1565
+ "lstrip": false,
1566
+ "normalized": false,
1567
+ "rstrip": false,
1568
+ "single_word": false,
1569
+ "special": true
1570
+ },
1571
+ "256193": {
1572
+ "content": "vie_Latn",
1573
+ "lstrip": false,
1574
+ "normalized": false,
1575
+ "rstrip": false,
1576
+ "single_word": false,
1577
+ "special": true
1578
+ },
1579
+ "256194": {
1580
+ "content": "war_Latn",
1581
+ "lstrip": false,
1582
+ "normalized": false,
1583
+ "rstrip": false,
1584
+ "single_word": false,
1585
+ "special": true
1586
+ },
1587
+ "256195": {
1588
+ "content": "wol_Latn",
1589
+ "lstrip": false,
1590
+ "normalized": false,
1591
+ "rstrip": false,
1592
+ "single_word": false,
1593
+ "special": true
1594
+ },
1595
+ "256196": {
1596
+ "content": "xho_Latn",
1597
+ "lstrip": false,
1598
+ "normalized": false,
1599
+ "rstrip": false,
1600
+ "single_word": false,
1601
+ "special": true
1602
+ },
1603
+ "256197": {
1604
+ "content": "ydd_Hebr",
1605
+ "lstrip": false,
1606
+ "normalized": false,
1607
+ "rstrip": false,
1608
+ "single_word": false,
1609
+ "special": true
1610
+ },
1611
+ "256198": {
1612
+ "content": "yor_Latn",
1613
+ "lstrip": false,
1614
+ "normalized": false,
1615
+ "rstrip": false,
1616
+ "single_word": false,
1617
+ "special": true
1618
+ },
1619
+ "256199": {
1620
+ "content": "yue_Hant",
1621
+ "lstrip": false,
1622
+ "normalized": false,
1623
+ "rstrip": false,
1624
+ "single_word": false,
1625
+ "special": true
1626
+ },
1627
+ "256200": {
1628
+ "content": "zho_Hans",
1629
+ "lstrip": false,
1630
+ "normalized": false,
1631
+ "rstrip": false,
1632
+ "single_word": false,
1633
+ "special": true
1634
+ },
1635
+ "256201": {
1636
+ "content": "zho_Hant",
1637
+ "lstrip": false,
1638
+ "normalized": false,
1639
+ "rstrip": false,
1640
+ "single_word": false,
1641
+ "special": true
1642
+ },
1643
+ "256202": {
1644
+ "content": "zul_Latn",
1645
+ "lstrip": false,
1646
+ "normalized": false,
1647
+ "rstrip": false,
1648
+ "single_word": false,
1649
+ "special": true
1650
+ },
1651
+ "256203": {
1652
+ "content": "<mask>",
1653
+ "lstrip": true,
1654
+ "normalized": true,
1655
+ "rstrip": false,
1656
+ "single_word": false,
1657
+ "special": true
1658
+ },
1659
+ "256204": {
1660
+ "content": "and_Cas",
1661
+ "lstrip": false,
1662
+ "normalized": false,
1663
+ "rstrip": false,
1664
+ "single_word": false,
1665
+ "special": true
1666
+ }
1667
+ },
1668
+ "additional_special_tokens": [
1669
+ "and_Cas"
1670
+ ],
1671
+ "bos_token": "<s>",
1672
+ "clean_up_tokenization_spaces": false,
1673
+ "cls_token": "<s>",
1674
+ "eos_token": "</s>",
1675
+ "extra_special_tokens": {},
1676
+ "legacy_behaviour": false,
1677
+ "mask_token": "<mask>",
1678
+ "model_max_length": 1024,
1679
+ "pad_token": "<pad>",
1680
+ "sep_token": "</s>",
1681
+ "sp_model_kwargs": {},
1682
+ "src_lang": "eng_Latn",
1683
+ "tgt_lang": "and_Cas",
1684
+ "tokenizer_class": "NllbTokenizer",
1685
+ "unk_token": "<unk>"
1686
+ }
trainer_state.json ADDED
@@ -0,0 +1,978 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 12.0,
5
+ "eval_steps": 500,
6
+ "global_step": 5616,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.002136752136752137,
13
+ "grad_norm": 6.609935760498047,
14
+ "learning_rate": 9.998219373219375e-05,
15
+ "loss": 10.1057,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.10683760683760683,
20
+ "grad_norm": 15.156593322753906,
21
+ "learning_rate": 9.910968660968662e-05,
22
+ "loss": 6.382,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.21367521367521367,
27
+ "grad_norm": 12.51414966583252,
28
+ "learning_rate": 9.821937321937323e-05,
29
+ "loss": 3.4357,
30
+ "step": 100
31
+ },
32
+ {
33
+ "epoch": 0.32051282051282054,
34
+ "grad_norm": 2.1484034061431885,
35
+ "learning_rate": 9.732905982905983e-05,
36
+ "loss": 1.354,
37
+ "step": 150
38
+ },
39
+ {
40
+ "epoch": 0.42735042735042733,
41
+ "grad_norm": 0.6900449395179749,
42
+ "learning_rate": 9.643874643874644e-05,
43
+ "loss": 0.7456,
44
+ "step": 200
45
+ },
46
+ {
47
+ "epoch": 0.5341880341880342,
48
+ "grad_norm": 0.8403804302215576,
49
+ "learning_rate": 9.554843304843305e-05,
50
+ "loss": 0.7155,
51
+ "step": 250
52
+ },
53
+ {
54
+ "epoch": 0.6410256410256411,
55
+ "grad_norm": 0.5271189212799072,
56
+ "learning_rate": 9.465811965811966e-05,
57
+ "loss": 0.6685,
58
+ "step": 300
59
+ },
60
+ {
61
+ "epoch": 0.7478632478632479,
62
+ "grad_norm": 0.5689147114753723,
63
+ "learning_rate": 9.376780626780627e-05,
64
+ "loss": 0.6531,
65
+ "step": 350
66
+ },
67
+ {
68
+ "epoch": 0.8547008547008547,
69
+ "grad_norm": 0.6126474738121033,
70
+ "learning_rate": 9.287749287749287e-05,
71
+ "loss": 0.6391,
72
+ "step": 400
73
+ },
74
+ {
75
+ "epoch": 0.9615384615384616,
76
+ "grad_norm": 0.4521735906600952,
77
+ "learning_rate": 9.198717948717949e-05,
78
+ "loss": 0.6862,
79
+ "step": 450
80
+ },
81
+ {
82
+ "epoch": 1.0,
83
+ "eval_bleu": 0.04672201686543051,
84
+ "eval_loss": 0.5959261655807495,
85
+ "eval_rouge1": 0.1002624774361735,
86
+ "eval_rouge2": 0.04186894193526817,
87
+ "eval_rougeL": 0.0864802858192733,
88
+ "eval_rougeLsum": 0.08708246891026986,
89
+ "eval_runtime": 119.4408,
90
+ "eval_sacrebleu": 4.6722016865430485,
91
+ "eval_samples_per_second": 3.483,
92
+ "eval_steps_per_second": 0.435,
93
+ "step": 468
94
+ },
95
+ {
96
+ "epoch": 1.0683760683760684,
97
+ "grad_norm": 0.5592613220214844,
98
+ "learning_rate": 9.10968660968661e-05,
99
+ "loss": 0.6163,
100
+ "step": 500
101
+ },
102
+ {
103
+ "epoch": 1.1752136752136753,
104
+ "grad_norm": 0.5470952391624451,
105
+ "learning_rate": 9.020655270655272e-05,
106
+ "loss": 0.5398,
107
+ "step": 550
108
+ },
109
+ {
110
+ "epoch": 1.282051282051282,
111
+ "grad_norm": 0.8431143164634705,
112
+ "learning_rate": 8.931623931623932e-05,
113
+ "loss": 0.5341,
114
+ "step": 600
115
+ },
116
+ {
117
+ "epoch": 1.3888888888888888,
118
+ "grad_norm": 0.952499508857727,
119
+ "learning_rate": 8.842592592592593e-05,
120
+ "loss": 0.5174,
121
+ "step": 650
122
+ },
123
+ {
124
+ "epoch": 1.4957264957264957,
125
+ "grad_norm": 0.6722558736801147,
126
+ "learning_rate": 8.753561253561254e-05,
127
+ "loss": 0.485,
128
+ "step": 700
129
+ },
130
+ {
131
+ "epoch": 1.6025641025641026,
132
+ "grad_norm": 0.6445268392562866,
133
+ "learning_rate": 8.664529914529916e-05,
134
+ "loss": 0.5493,
135
+ "step": 750
136
+ },
137
+ {
138
+ "epoch": 1.7094017094017095,
139
+ "grad_norm": 0.6697048544883728,
140
+ "learning_rate": 8.575498575498576e-05,
141
+ "loss": 0.5587,
142
+ "step": 800
143
+ },
144
+ {
145
+ "epoch": 1.8162393162393162,
146
+ "grad_norm": 0.39390167593955994,
147
+ "learning_rate": 8.486467236467237e-05,
148
+ "loss": 0.5105,
149
+ "step": 850
150
+ },
151
+ {
152
+ "epoch": 1.9230769230769231,
153
+ "grad_norm": 0.6466375589370728,
154
+ "learning_rate": 8.397435897435898e-05,
155
+ "loss": 0.5139,
156
+ "step": 900
157
+ },
158
+ {
159
+ "epoch": 2.0,
160
+ "eval_bleu": 0.016857307800214223,
161
+ "eval_loss": 0.5903982520103455,
162
+ "eval_rouge1": 0.1420201153596669,
163
+ "eval_rouge2": 0.0579418436167993,
164
+ "eval_rougeL": 0.12100579265783531,
165
+ "eval_rougeLsum": 0.12159562644037325,
166
+ "eval_runtime": 118.3383,
167
+ "eval_sacrebleu": 1.6857307800214218,
168
+ "eval_samples_per_second": 3.515,
169
+ "eval_steps_per_second": 0.439,
170
+ "step": 936
171
+ },
172
+ {
173
+ "epoch": 2.02991452991453,
174
+ "grad_norm": 1.3548823595046997,
175
+ "learning_rate": 8.308404558404559e-05,
176
+ "loss": 0.4904,
177
+ "step": 950
178
+ },
179
+ {
180
+ "epoch": 2.1367521367521367,
181
+ "grad_norm": 0.5524053573608398,
182
+ "learning_rate": 8.21937321937322e-05,
183
+ "loss": 0.422,
184
+ "step": 1000
185
+ },
186
+ {
187
+ "epoch": 2.2435897435897436,
188
+ "grad_norm": 0.6330443024635315,
189
+ "learning_rate": 8.13034188034188e-05,
190
+ "loss": 0.3609,
191
+ "step": 1050
192
+ },
193
+ {
194
+ "epoch": 2.3504273504273505,
195
+ "grad_norm": 0.4444356858730316,
196
+ "learning_rate": 8.041310541310541e-05,
197
+ "loss": 0.3737,
198
+ "step": 1100
199
+ },
200
+ {
201
+ "epoch": 2.4572649572649574,
202
+ "grad_norm": 0.47359105944633484,
203
+ "learning_rate": 7.952279202279203e-05,
204
+ "loss": 0.4244,
205
+ "step": 1150
206
+ },
207
+ {
208
+ "epoch": 2.564102564102564,
209
+ "grad_norm": 0.9119735956192017,
210
+ "learning_rate": 7.863247863247864e-05,
211
+ "loss": 0.4327,
212
+ "step": 1200
213
+ },
214
+ {
215
+ "epoch": 2.6709401709401708,
216
+ "grad_norm": 0.565905749797821,
217
+ "learning_rate": 7.774216524216525e-05,
218
+ "loss": 0.4251,
219
+ "step": 1250
220
+ },
221
+ {
222
+ "epoch": 2.7777777777777777,
223
+ "grad_norm": 0.8833394050598145,
224
+ "learning_rate": 7.685185185185185e-05,
225
+ "loss": 0.3912,
226
+ "step": 1300
227
+ },
228
+ {
229
+ "epoch": 2.8846153846153846,
230
+ "grad_norm": 0.5193057656288147,
231
+ "learning_rate": 7.596153846153846e-05,
232
+ "loss": 0.3983,
233
+ "step": 1350
234
+ },
235
+ {
236
+ "epoch": 2.9914529914529915,
237
+ "grad_norm": 0.5576235055923462,
238
+ "learning_rate": 7.507122507122507e-05,
239
+ "loss": 0.4458,
240
+ "step": 1400
241
+ },
242
+ {
243
+ "epoch": 3.0,
244
+ "eval_bleu": 0.02090423126560895,
245
+ "eval_loss": 0.6054907441139221,
246
+ "eval_rouge1": 0.14516790628410786,
247
+ "eval_rouge2": 0.06127825912334307,
248
+ "eval_rougeL": 0.12493260250258728,
249
+ "eval_rougeLsum": 0.125945256795481,
250
+ "eval_runtime": 117.4378,
251
+ "eval_sacrebleu": 2.090423126560895,
252
+ "eval_samples_per_second": 3.542,
253
+ "eval_steps_per_second": 0.443,
254
+ "step": 1404
255
+ },
256
+ {
257
+ "epoch": 3.0982905982905984,
258
+ "grad_norm": 0.548673152923584,
259
+ "learning_rate": 7.418091168091168e-05,
260
+ "loss": 0.3028,
261
+ "step": 1450
262
+ },
263
+ {
264
+ "epoch": 3.2051282051282053,
265
+ "grad_norm": 0.5551701188087463,
266
+ "learning_rate": 7.32905982905983e-05,
267
+ "loss": 0.3121,
268
+ "step": 1500
269
+ },
270
+ {
271
+ "epoch": 3.3119658119658117,
272
+ "grad_norm": 0.77285236120224,
273
+ "learning_rate": 7.240028490028491e-05,
274
+ "loss": 0.3426,
275
+ "step": 1550
276
+ },
277
+ {
278
+ "epoch": 3.4188034188034186,
279
+ "grad_norm": 0.5966659784317017,
280
+ "learning_rate": 7.150997150997152e-05,
281
+ "loss": 0.316,
282
+ "step": 1600
283
+ },
284
+ {
285
+ "epoch": 3.5256410256410255,
286
+ "grad_norm": 0.5765504240989685,
287
+ "learning_rate": 7.061965811965813e-05,
288
+ "loss": 0.3229,
289
+ "step": 1650
290
+ },
291
+ {
292
+ "epoch": 3.6324786324786325,
293
+ "grad_norm": 0.4516245722770691,
294
+ "learning_rate": 6.972934472934474e-05,
295
+ "loss": 0.3153,
296
+ "step": 1700
297
+ },
298
+ {
299
+ "epoch": 3.7393162393162394,
300
+ "grad_norm": 0.6088860034942627,
301
+ "learning_rate": 6.883903133903134e-05,
302
+ "loss": 0.3197,
303
+ "step": 1750
304
+ },
305
+ {
306
+ "epoch": 3.8461538461538463,
307
+ "grad_norm": 0.5132762789726257,
308
+ "learning_rate": 6.794871794871795e-05,
309
+ "loss": 0.3189,
310
+ "step": 1800
311
+ },
312
+ {
313
+ "epoch": 3.952991452991453,
314
+ "grad_norm": 0.9502617716789246,
315
+ "learning_rate": 6.705840455840457e-05,
316
+ "loss": 0.3306,
317
+ "step": 1850
318
+ },
319
+ {
320
+ "epoch": 4.0,
321
+ "eval_bleu": 0.011934404928163667,
322
+ "eval_loss": 0.6525160074234009,
323
+ "eval_rouge1": 0.08905472709315013,
324
+ "eval_rouge2": 0.03442774422531769,
325
+ "eval_rougeL": 0.07354540899487223,
326
+ "eval_rougeLsum": 0.07383306372734402,
327
+ "eval_runtime": 118.2147,
328
+ "eval_sacrebleu": 1.1934404928163673,
329
+ "eval_samples_per_second": 3.519,
330
+ "eval_steps_per_second": 0.44,
331
+ "step": 1872
332
+ },
333
+ {
334
+ "epoch": 4.05982905982906,
335
+ "grad_norm": 0.5066898465156555,
336
+ "learning_rate": 6.616809116809118e-05,
337
+ "loss": 0.2767,
338
+ "step": 1900
339
+ },
340
+ {
341
+ "epoch": 4.166666666666667,
342
+ "grad_norm": 0.7272780537605286,
343
+ "learning_rate": 6.527777777777778e-05,
344
+ "loss": 0.2513,
345
+ "step": 1950
346
+ },
347
+ {
348
+ "epoch": 4.273504273504273,
349
+ "grad_norm": 0.7428900599479675,
350
+ "learning_rate": 6.438746438746439e-05,
351
+ "loss": 0.2388,
352
+ "step": 2000
353
+ },
354
+ {
355
+ "epoch": 4.380341880341881,
356
+ "grad_norm": 0.6635400652885437,
357
+ "learning_rate": 6.3497150997151e-05,
358
+ "loss": 0.2486,
359
+ "step": 2050
360
+ },
361
+ {
362
+ "epoch": 4.487179487179487,
363
+ "grad_norm": 1.197430968284607,
364
+ "learning_rate": 6.260683760683761e-05,
365
+ "loss": 0.2518,
366
+ "step": 2100
367
+ },
368
+ {
369
+ "epoch": 4.594017094017094,
370
+ "grad_norm": 0.6132161021232605,
371
+ "learning_rate": 6.171652421652422e-05,
372
+ "loss": 0.261,
373
+ "step": 2150
374
+ },
375
+ {
376
+ "epoch": 4.700854700854701,
377
+ "grad_norm": 0.8142279982566833,
378
+ "learning_rate": 6.082621082621083e-05,
379
+ "loss": 0.2483,
380
+ "step": 2200
381
+ },
382
+ {
383
+ "epoch": 4.8076923076923075,
384
+ "grad_norm": 0.6046409606933594,
385
+ "learning_rate": 5.9935897435897434e-05,
386
+ "loss": 0.2468,
387
+ "step": 2250
388
+ },
389
+ {
390
+ "epoch": 4.914529914529915,
391
+ "grad_norm": 0.6721770167350769,
392
+ "learning_rate": 5.9045584045584046e-05,
393
+ "loss": 0.2362,
394
+ "step": 2300
395
+ },
396
+ {
397
+ "epoch": 5.0,
398
+ "eval_bleu": 0.01982264823992269,
399
+ "eval_loss": 0.6933444142341614,
400
+ "eval_rouge1": 0.1119075273549715,
401
+ "eval_rouge2": 0.04113128900127839,
402
+ "eval_rougeL": 0.09342164825270136,
403
+ "eval_rougeLsum": 0.09348866145324419,
404
+ "eval_runtime": 117.21,
405
+ "eval_sacrebleu": 1.982264823992269,
406
+ "eval_samples_per_second": 3.549,
407
+ "eval_steps_per_second": 0.444,
408
+ "step": 2340
409
+ },
410
+ {
411
+ "epoch": 5.021367521367521,
412
+ "grad_norm": 0.5348103642463684,
413
+ "learning_rate": 5.815527065527066e-05,
414
+ "loss": 0.2422,
415
+ "step": 2350
416
+ },
417
+ {
418
+ "epoch": 5.128205128205128,
419
+ "grad_norm": 0.676667332649231,
420
+ "learning_rate": 5.726495726495726e-05,
421
+ "loss": 0.1809,
422
+ "step": 2400
423
+ },
424
+ {
425
+ "epoch": 5.235042735042735,
426
+ "grad_norm": 0.5570082664489746,
427
+ "learning_rate": 5.6374643874643875e-05,
428
+ "loss": 0.2015,
429
+ "step": 2450
430
+ },
431
+ {
432
+ "epoch": 5.3418803418803416,
433
+ "grad_norm": 0.5290353298187256,
434
+ "learning_rate": 5.548433048433048e-05,
435
+ "loss": 0.202,
436
+ "step": 2500
437
+ },
438
+ {
439
+ "epoch": 5.448717948717949,
440
+ "grad_norm": 0.5909162163734436,
441
+ "learning_rate": 5.459401709401709e-05,
442
+ "loss": 0.1949,
443
+ "step": 2550
444
+ },
445
+ {
446
+ "epoch": 5.555555555555555,
447
+ "grad_norm": 0.7708612680435181,
448
+ "learning_rate": 5.370370370370371e-05,
449
+ "loss": 0.196,
450
+ "step": 2600
451
+ },
452
+ {
453
+ "epoch": 5.662393162393163,
454
+ "grad_norm": 0.7512525320053101,
455
+ "learning_rate": 5.281339031339032e-05,
456
+ "loss": 0.1814,
457
+ "step": 2650
458
+ },
459
+ {
460
+ "epoch": 5.769230769230769,
461
+ "grad_norm": 0.5676941275596619,
462
+ "learning_rate": 5.192307692307693e-05,
463
+ "loss": 0.1882,
464
+ "step": 2700
465
+ },
466
+ {
467
+ "epoch": 5.8760683760683765,
468
+ "grad_norm": 0.6577980518341064,
469
+ "learning_rate": 5.103276353276354e-05,
470
+ "loss": 0.1808,
471
+ "step": 2750
472
+ },
473
+ {
474
+ "epoch": 5.982905982905983,
475
+ "grad_norm": 0.8404538631439209,
476
+ "learning_rate": 5.0142450142450145e-05,
477
+ "loss": 0.1873,
478
+ "step": 2800
479
+ },
480
+ {
481
+ "epoch": 6.0,
482
+ "eval_bleu": 0.01596588236585614,
483
+ "eval_loss": 0.7384710311889648,
484
+ "eval_rouge1": 0.09046890592711473,
485
+ "eval_rouge2": 0.033374593261045735,
486
+ "eval_rougeL": 0.0744312633791386,
487
+ "eval_rougeLsum": 0.07484231387423937,
488
+ "eval_runtime": 119.5498,
489
+ "eval_sacrebleu": 1.5965882365856137,
490
+ "eval_samples_per_second": 3.48,
491
+ "eval_steps_per_second": 0.435,
492
+ "step": 2808
493
+ },
494
+ {
495
+ "epoch": 6.089743589743589,
496
+ "grad_norm": 0.4537138044834137,
497
+ "learning_rate": 4.925213675213676e-05,
498
+ "loss": 0.1509,
499
+ "step": 2850
500
+ },
501
+ {
502
+ "epoch": 6.196581196581197,
503
+ "grad_norm": 0.9405523538589478,
504
+ "learning_rate": 4.836182336182337e-05,
505
+ "loss": 0.1451,
506
+ "step": 2900
507
+ },
508
+ {
509
+ "epoch": 6.303418803418803,
510
+ "grad_norm": 0.690613865852356,
511
+ "learning_rate": 4.7471509971509974e-05,
512
+ "loss": 0.1544,
513
+ "step": 2950
514
+ },
515
+ {
516
+ "epoch": 6.410256410256411,
517
+ "grad_norm": 0.5879324078559875,
518
+ "learning_rate": 4.6581196581196586e-05,
519
+ "loss": 0.1501,
520
+ "step": 3000
521
+ },
522
+ {
523
+ "epoch": 6.517094017094017,
524
+ "grad_norm": 0.5145525336265564,
525
+ "learning_rate": 4.569088319088319e-05,
526
+ "loss": 0.1472,
527
+ "step": 3050
528
+ },
529
+ {
530
+ "epoch": 6.6239316239316235,
531
+ "grad_norm": 0.7620146870613098,
532
+ "learning_rate": 4.48005698005698e-05,
533
+ "loss": 0.1499,
534
+ "step": 3100
535
+ },
536
+ {
537
+ "epoch": 6.730769230769231,
538
+ "grad_norm": 0.6944196820259094,
539
+ "learning_rate": 4.3910256410256415e-05,
540
+ "loss": 0.1381,
541
+ "step": 3150
542
+ },
543
+ {
544
+ "epoch": 6.837606837606837,
545
+ "grad_norm": 0.8817417621612549,
546
+ "learning_rate": 4.301994301994302e-05,
547
+ "loss": 0.1445,
548
+ "step": 3200
549
+ },
550
+ {
551
+ "epoch": 6.944444444444445,
552
+ "grad_norm": 0.4323042929172516,
553
+ "learning_rate": 4.212962962962963e-05,
554
+ "loss": 0.1446,
555
+ "step": 3250
556
+ },
557
+ {
558
+ "epoch": 7.0,
559
+ "eval_bleu": 0.01693650686803547,
560
+ "eval_loss": 0.7932016849517822,
561
+ "eval_rouge1": 0.09734769451765327,
562
+ "eval_rouge2": 0.035070400258041395,
563
+ "eval_rougeL": 0.07970926682147024,
564
+ "eval_rougeLsum": 0.08024775367668704,
565
+ "eval_runtime": 117.0761,
566
+ "eval_sacrebleu": 1.693650686803547,
567
+ "eval_samples_per_second": 3.553,
568
+ "eval_steps_per_second": 0.444,
569
+ "step": 3276
570
+ },
571
+ {
572
+ "epoch": 7.051282051282051,
573
+ "grad_norm": 0.49853190779685974,
574
+ "learning_rate": 4.123931623931624e-05,
575
+ "loss": 0.1209,
576
+ "step": 3300
577
+ },
578
+ {
579
+ "epoch": 7.1581196581196584,
580
+ "grad_norm": 0.7226719260215759,
581
+ "learning_rate": 4.034900284900285e-05,
582
+ "loss": 0.1065,
583
+ "step": 3350
584
+ },
585
+ {
586
+ "epoch": 7.264957264957265,
587
+ "grad_norm": 0.4723590910434723,
588
+ "learning_rate": 3.945868945868946e-05,
589
+ "loss": 0.1074,
590
+ "step": 3400
591
+ },
592
+ {
593
+ "epoch": 7.371794871794872,
594
+ "grad_norm": 0.7136197090148926,
595
+ "learning_rate": 3.856837606837607e-05,
596
+ "loss": 0.109,
597
+ "step": 3450
598
+ },
599
+ {
600
+ "epoch": 7.478632478632479,
601
+ "grad_norm": 0.7125486135482788,
602
+ "learning_rate": 3.767806267806268e-05,
603
+ "loss": 0.1141,
604
+ "step": 3500
605
+ },
606
+ {
607
+ "epoch": 7.585470085470085,
608
+ "grad_norm": 0.6564122438430786,
609
+ "learning_rate": 3.678774928774929e-05,
610
+ "loss": 0.1079,
611
+ "step": 3550
612
+ },
613
+ {
614
+ "epoch": 7.6923076923076925,
615
+ "grad_norm": 0.8024164438247681,
616
+ "learning_rate": 3.58974358974359e-05,
617
+ "loss": 0.1256,
618
+ "step": 3600
619
+ },
620
+ {
621
+ "epoch": 7.799145299145299,
622
+ "grad_norm": 0.5174832940101624,
623
+ "learning_rate": 3.500712250712251e-05,
624
+ "loss": 0.1221,
625
+ "step": 3650
626
+ },
627
+ {
628
+ "epoch": 7.905982905982906,
629
+ "grad_norm": 0.5609320402145386,
630
+ "learning_rate": 3.411680911680912e-05,
631
+ "loss": 0.1241,
632
+ "step": 3700
633
+ },
634
+ {
635
+ "epoch": 8.0,
636
+ "eval_bleu": 0.01642080481375883,
637
+ "eval_loss": 0.8241382837295532,
638
+ "eval_rouge1": 0.10111779789655295,
639
+ "eval_rouge2": 0.03756803371781556,
640
+ "eval_rougeL": 0.08381874904742523,
641
+ "eval_rougeLsum": 0.08407042216117307,
642
+ "eval_runtime": 118.3205,
643
+ "eval_sacrebleu": 1.6420804813758827,
644
+ "eval_samples_per_second": 3.516,
645
+ "eval_steps_per_second": 0.439,
646
+ "step": 3744
647
+ },
648
+ {
649
+ "epoch": 8.012820512820513,
650
+ "grad_norm": 0.42624956369400024,
651
+ "learning_rate": 3.3226495726495725e-05,
652
+ "loss": 0.114,
653
+ "step": 3750
654
+ },
655
+ {
656
+ "epoch": 8.11965811965812,
657
+ "grad_norm": 0.5220057368278503,
658
+ "learning_rate": 3.2336182336182337e-05,
659
+ "loss": 0.0839,
660
+ "step": 3800
661
+ },
662
+ {
663
+ "epoch": 8.226495726495726,
664
+ "grad_norm": 0.6766859889030457,
665
+ "learning_rate": 3.144586894586894e-05,
666
+ "loss": 0.0932,
667
+ "step": 3850
668
+ },
669
+ {
670
+ "epoch": 8.333333333333334,
671
+ "grad_norm": 0.5749387741088867,
672
+ "learning_rate": 3.055555555555556e-05,
673
+ "loss": 0.1015,
674
+ "step": 3900
675
+ },
676
+ {
677
+ "epoch": 8.44017094017094,
678
+ "grad_norm": 0.5146998763084412,
679
+ "learning_rate": 2.966524216524217e-05,
680
+ "loss": 0.0841,
681
+ "step": 3950
682
+ },
683
+ {
684
+ "epoch": 8.547008547008547,
685
+ "grad_norm": 0.6653253436088562,
686
+ "learning_rate": 2.8774928774928778e-05,
687
+ "loss": 0.0943,
688
+ "step": 4000
689
+ },
690
+ {
691
+ "epoch": 8.653846153846153,
692
+ "grad_norm": 0.5048075914382935,
693
+ "learning_rate": 2.7884615384615386e-05,
694
+ "loss": 0.0922,
695
+ "step": 4050
696
+ },
697
+ {
698
+ "epoch": 8.760683760683762,
699
+ "grad_norm": 0.6959784626960754,
700
+ "learning_rate": 2.6994301994301995e-05,
701
+ "loss": 0.0934,
702
+ "step": 4100
703
+ },
704
+ {
705
+ "epoch": 8.867521367521368,
706
+ "grad_norm": 0.44051143527030945,
707
+ "learning_rate": 2.6103988603988607e-05,
708
+ "loss": 0.091,
709
+ "step": 4150
710
+ },
711
+ {
712
+ "epoch": 8.974358974358974,
713
+ "grad_norm": 0.49819517135620117,
714
+ "learning_rate": 2.5213675213675215e-05,
715
+ "loss": 0.0931,
716
+ "step": 4200
717
+ },
718
+ {
719
+ "epoch": 9.0,
720
+ "eval_bleu": 0.014140141904671576,
721
+ "eval_loss": 0.8493850231170654,
722
+ "eval_rouge1": 0.10066028182849125,
723
+ "eval_rouge2": 0.038329211239954836,
724
+ "eval_rougeL": 0.0832079573438359,
725
+ "eval_rougeLsum": 0.08395177587376235,
726
+ "eval_runtime": 116.8091,
727
+ "eval_sacrebleu": 1.414014190467158,
728
+ "eval_samples_per_second": 3.561,
729
+ "eval_steps_per_second": 0.445,
730
+ "step": 4212
731
+ },
732
+ {
733
+ "epoch": 9.081196581196581,
734
+ "grad_norm": 0.41481533646583557,
735
+ "learning_rate": 2.4323361823361824e-05,
736
+ "loss": 0.0834,
737
+ "step": 4250
738
+ },
739
+ {
740
+ "epoch": 9.188034188034187,
741
+ "grad_norm": 0.4956184923648834,
742
+ "learning_rate": 2.3433048433048436e-05,
743
+ "loss": 0.0798,
744
+ "step": 4300
745
+ },
746
+ {
747
+ "epoch": 9.294871794871796,
748
+ "grad_norm": 0.45922133326530457,
749
+ "learning_rate": 2.2542735042735044e-05,
750
+ "loss": 0.0721,
751
+ "step": 4350
752
+ },
753
+ {
754
+ "epoch": 9.401709401709402,
755
+ "grad_norm": 0.5544161796569824,
756
+ "learning_rate": 2.1652421652421653e-05,
757
+ "loss": 0.0726,
758
+ "step": 4400
759
+ },
760
+ {
761
+ "epoch": 9.508547008547009,
762
+ "grad_norm": 0.6368575692176819,
763
+ "learning_rate": 2.076210826210826e-05,
764
+ "loss": 0.0794,
765
+ "step": 4450
766
+ },
767
+ {
768
+ "epoch": 9.615384615384615,
769
+ "grad_norm": 0.4519352614879608,
770
+ "learning_rate": 1.987179487179487e-05,
771
+ "loss": 0.0754,
772
+ "step": 4500
773
+ },
774
+ {
775
+ "epoch": 9.722222222222221,
776
+ "grad_norm": 0.4205610752105713,
777
+ "learning_rate": 1.8981481481481482e-05,
778
+ "loss": 0.0725,
779
+ "step": 4550
780
+ },
781
+ {
782
+ "epoch": 9.82905982905983,
783
+ "grad_norm": 0.5720301866531372,
784
+ "learning_rate": 1.8091168091168094e-05,
785
+ "loss": 0.0723,
786
+ "step": 4600
787
+ },
788
+ {
789
+ "epoch": 9.935897435897436,
790
+ "grad_norm": 0.7100991010665894,
791
+ "learning_rate": 1.7200854700854702e-05,
792
+ "loss": 0.0709,
793
+ "step": 4650
794
+ },
795
+ {
796
+ "epoch": 10.0,
797
+ "eval_bleu": 0.013252890319355445,
798
+ "eval_loss": 0.8691538572311401,
799
+ "eval_rouge1": 0.10388912458002297,
800
+ "eval_rouge2": 0.038333106011199644,
801
+ "eval_rougeL": 0.08534074309444875,
802
+ "eval_rougeLsum": 0.08616990109781715,
803
+ "eval_runtime": 118.036,
804
+ "eval_sacrebleu": 1.3252890319355448,
805
+ "eval_samples_per_second": 3.524,
806
+ "eval_steps_per_second": 0.441,
807
+ "step": 4680
808
+ },
809
+ {
810
+ "epoch": 10.042735042735043,
811
+ "grad_norm": 0.6815042495727539,
812
+ "learning_rate": 1.631054131054131e-05,
813
+ "loss": 0.0736,
814
+ "step": 4700
815
+ },
816
+ {
817
+ "epoch": 10.149572649572649,
818
+ "grad_norm": 0.4753289222717285,
819
+ "learning_rate": 1.542022792022792e-05,
820
+ "loss": 0.0654,
821
+ "step": 4750
822
+ },
823
+ {
824
+ "epoch": 10.256410256410255,
825
+ "grad_norm": 0.5216003656387329,
826
+ "learning_rate": 1.4529914529914531e-05,
827
+ "loss": 0.0662,
828
+ "step": 4800
829
+ },
830
+ {
831
+ "epoch": 10.363247863247864,
832
+ "grad_norm": 1.007856011390686,
833
+ "learning_rate": 1.3639601139601142e-05,
834
+ "loss": 0.0644,
835
+ "step": 4850
836
+ },
837
+ {
838
+ "epoch": 10.47008547008547,
839
+ "grad_norm": 0.612964391708374,
840
+ "learning_rate": 1.274928774928775e-05,
841
+ "loss": 0.0643,
842
+ "step": 4900
843
+ },
844
+ {
845
+ "epoch": 10.576923076923077,
846
+ "grad_norm": 0.6739233732223511,
847
+ "learning_rate": 1.1858974358974359e-05,
848
+ "loss": 0.0639,
849
+ "step": 4950
850
+ },
851
+ {
852
+ "epoch": 10.683760683760683,
853
+ "grad_norm": 1.0595338344573975,
854
+ "learning_rate": 1.0968660968660969e-05,
855
+ "loss": 0.0656,
856
+ "step": 5000
857
+ },
858
+ {
859
+ "epoch": 10.790598290598291,
860
+ "grad_norm": 0.5138745307922363,
861
+ "learning_rate": 1.007834757834758e-05,
862
+ "loss": 0.0612,
863
+ "step": 5050
864
+ },
865
+ {
866
+ "epoch": 10.897435897435898,
867
+ "grad_norm": 0.6354550123214722,
868
+ "learning_rate": 9.18803418803419e-06,
869
+ "loss": 0.0659,
870
+ "step": 5100
871
+ },
872
+ {
873
+ "epoch": 11.0,
874
+ "eval_bleu": 0.013803126905796853,
875
+ "eval_loss": 0.8812764286994934,
876
+ "eval_rouge1": 0.11534416583758171,
877
+ "eval_rouge2": 0.04454497187398293,
878
+ "eval_rougeL": 0.0963692302601049,
879
+ "eval_rougeLsum": 0.09670221754303093,
880
+ "eval_runtime": 119.7112,
881
+ "eval_sacrebleu": 1.3803126905796859,
882
+ "eval_samples_per_second": 3.475,
883
+ "eval_steps_per_second": 0.434,
884
+ "step": 5148
885
+ },
886
+ {
887
+ "epoch": 11.004273504273504,
888
+ "grad_norm": 0.3222929537296295,
889
+ "learning_rate": 8.297720797720798e-06,
890
+ "loss": 0.0657,
891
+ "step": 5150
892
+ },
893
+ {
894
+ "epoch": 11.11111111111111,
895
+ "grad_norm": 0.604082465171814,
896
+ "learning_rate": 7.4074074074074075e-06,
897
+ "loss": 0.0644,
898
+ "step": 5200
899
+ },
900
+ {
901
+ "epoch": 11.217948717948717,
902
+ "grad_norm": 0.4696787893772125,
903
+ "learning_rate": 6.517094017094018e-06,
904
+ "loss": 0.064,
905
+ "step": 5250
906
+ },
907
+ {
908
+ "epoch": 11.324786324786325,
909
+ "grad_norm": 0.4635153114795685,
910
+ "learning_rate": 5.626780626780627e-06,
911
+ "loss": 0.0604,
912
+ "step": 5300
913
+ },
914
+ {
915
+ "epoch": 11.431623931623932,
916
+ "grad_norm": 0.48816901445388794,
917
+ "learning_rate": 4.7364672364672365e-06,
918
+ "loss": 0.0584,
919
+ "step": 5350
920
+ },
921
+ {
922
+ "epoch": 11.538461538461538,
923
+ "grad_norm": 0.4458998739719391,
924
+ "learning_rate": 3.846153846153847e-06,
925
+ "loss": 0.0547,
926
+ "step": 5400
927
+ },
928
+ {
929
+ "epoch": 11.645299145299145,
930
+ "grad_norm": 0.4526476263999939,
931
+ "learning_rate": 2.955840455840456e-06,
932
+ "loss": 0.0576,
933
+ "step": 5450
934
+ },
935
+ {
936
+ "epoch": 11.752136752136753,
937
+ "grad_norm": 0.4015202224254608,
938
+ "learning_rate": 2.0655270655270656e-06,
939
+ "loss": 0.0583,
940
+ "step": 5500
941
+ },
942
+ {
943
+ "epoch": 11.85897435897436,
944
+ "grad_norm": 0.8102580308914185,
945
+ "learning_rate": 1.1752136752136752e-06,
946
+ "loss": 0.0573,
947
+ "step": 5550
948
+ },
949
+ {
950
+ "epoch": 11.965811965811966,
951
+ "grad_norm": 0.40322017669677734,
952
+ "learning_rate": 2.8490028490028494e-07,
953
+ "loss": 0.0534,
954
+ "step": 5600
955
+ }
956
+ ],
957
+ "logging_steps": 50,
958
+ "max_steps": 5616,
959
+ "num_input_tokens_seen": 0,
960
+ "num_train_epochs": 12,
961
+ "save_steps": 500,
962
+ "stateful_callbacks": {
963
+ "TrainerControl": {
964
+ "args": {
965
+ "should_epoch_stop": false,
966
+ "should_evaluate": false,
967
+ "should_log": false,
968
+ "should_save": true,
969
+ "should_training_stop": true
970
+ },
971
+ "attributes": {}
972
+ }
973
+ },
974
+ "total_flos": 1.215745681784832e+16,
975
+ "train_batch_size": 8,
976
+ "trial_name": null,
977
+ "trial_params": null
978
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b6538b00e5c2c39a9250f2c165ebcf94981caceb57d92bc760a58128530a030
3
+ size 5432