Spaces:

YAMITEK
/

text_Generation

Build error

App Files Files Community

YAMITEK committed on Apr 28, 2025

Commit

ba26806

verified ·

1 Parent(s): 0041430

Upload 12 files

Browse files

Files changed (12) hide show

app.py +24 -0
fine_tuned_model (1).zip +3 -0
fine_tuned_model (1)/config.json +39 -0
fine_tuned_model (1)/generation_config.json +6 -0
fine_tuned_model (1)/merges.txt +0 -0
fine_tuned_model (1)/model.safetensors +3 -0
fine_tuned_model (1)/special_tokens_map.json +6 -0
fine_tuned_model (1)/tokenizer.json +0 -0
fine_tuned_model (1)/tokenizer_config.json +21 -0
fine_tuned_model (1)/vocab.json +0 -0
requirements.txt +6 -0
text_generation_finetunning_notebook.ipynb +290 -0

app.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import streamlit as st
+from transformers import AutoModelForCausalLM, AutoTokenizer,pipeline
+import torch
+st.title("Text_Generator Fine tunning model")
+# Load model and tokenizer
+model_dir = "fine_tuned_model (1)"
+tokenizer = AutoTokenizer.from_pretrained(model_dir)
+model = AutoModelForCausalLM.from_pretrained(model_dir)
+code_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+inputs_text=st.text_input("Please enter the text",value="def quicksort(arr):")
+if st.button("submit"):
+    generated_code = code_generator(inputs_text, max_length=200, num_return_sequences=1)
+    st.write(generated_code[0]["generated_text"])

fine_tuned_model (1).zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e571aaa5e03efcbab67c083fb3884631f00fec87b86aeef60a6dbc298b4ed31a
+size 463917526

fine_tuned_model (1)/config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "_name_or_path": "gpt2",
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "vocab_size": 50257
+}

fine_tuned_model (1)/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.49.0"
+}

fine_tuned_model (1)/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

fine_tuned_model (1)/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5329d3767b696ddf346aae65954b3679aa13b71d1d4a577be2a1b1e5cfdf7d0
+size 497774208

fine_tuned_model (1)/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

fine_tuned_model (1)/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

fine_tuned_model (1)/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

fine_tuned_model (1)/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+# Pinned dependencies; each package is listed exactly once.
+pandas==2.2.2
+torch==2.5.1
+transformers==4.48.3
+streamlit==1.41.1
+bitsandbytes==0.45.3

text_generation_finetunning_notebook.ipynb ADDED Viewed

	@@ -0,0 +1,290 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "9665f082-b1e2-4094-a9c4-f5fa4560e01f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+    "\n",
+    "model_name = \"gpt2\" \n",
+    "model = AutoModelForCausalLM.from_pretrained(model_name)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "\n",
+    "# Reuse the EOS token as the pad token so that padded tokenization (padding=\"max_length\") works\n",
+    "tokenizer.pad_token = tokenizer.eos_token  \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "8c81406c-1335-4491-b8cd-67770e86e390",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "2fd0c7d7-1c01-416c-af00-2d11a51663f1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "833d3e6bacf94b4f83849b76e554c187",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/36718 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "def tokenize_function(examples):\n",
+    "    return tokenizer(examples[\"text\"], truncation=True, padding=\"max_length\", max_length=512)\n",
+    "\n",
+    "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "85a7f1be-a72d-4b94-b232-4942616810f9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/transformers/training_args.py:1594: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import TrainingArguments\n",
+    "\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"./results\",\n",
+    "    evaluation_strategy=\"epoch\",\n",
+    "    save_strategy=\"epoch\",\n",
+    "    per_device_train_batch_size=8,  # Adjust based on your GPU\n",
+    "    per_device_eval_batch_size=8,\n",
+    "    logging_dir=\"./logs\",\n",
+    "    logging_steps=10,\n",
+    "    num_train_epochs=1,\n",
+    "    report_to=\"none\",  # Change to \"wandb\" or \"tensorboard\" if using logging\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "cb46a328-74ef-420a-b5d7-b3159cc8f5b0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='4590' max='4590' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [4590/4590 1:19:10, Epoch 1/1]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Epoch</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>3.239600</td>\n",
+       "      <td>3.291132</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "TrainOutput(global_step=4590, training_loss=3.347612351062251, metrics={'train_runtime': 4751.264, 'train_samples_per_second': 7.728, 'train_steps_per_second': 0.966, 'total_flos': 9594120830976000.0, 'train_loss': 3.347612351062251, 'epoch': 1.0})"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling\n",
+    "\n",
+    "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n",
+    "\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=tokenized_datasets[\"train\"],\n",
+    "    eval_dataset=tokenized_datasets[\"validation\"],\n",
+    "    data_collator=data_collator,\n",
+    ")\n",
+    "\n",
+    "trainer.train()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "d257f423-a9ea-4fe2-9fcf-bebcf1cd356d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "('fine_tuned_model/tokenizer_config.json',\n",
+       " 'fine_tuned_model/special_tokens_map.json',\n",
+       " 'fine_tuned_model/vocab.json',\n",
+       " 'fine_tuned_model/merges.txt',\n",
+       " 'fine_tuned_model/added_tokens.json',\n",
+       " 'fine_tuned_model/tokenizer.json')"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.save_pretrained(\"fine_tuned_model\")\n",
+    "tokenizer.save_pretrained(\"fine_tuned_model\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "493e4e36-45a6-4cd2-b37d-2e8e534f1a39",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Folder 'fine_tuned_model' has been zipped as 'fine_tuned_model.zip'.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import shutil\n",
+    "\n",
+    "# Specify the folder to be zipped\n",
+    "folder_path = \"fine_tuned_model\" # Replace with your actual folder name\n",
+    "zip_name = \"fine_tuned_model.zip\"  # Desired zip file name\n",
+    "\n",
+    "# Create a zip archive\n",
+    "shutil.make_archive(zip_name.replace('.zip', ''), 'zip', folder_path)\n",
+    "\n",
+    "print(f\"Folder '{folder_path}' has been zipped as '{zip_name}'.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "fda9cf8b-1e3c-47c2-8a60-11cccf2d608a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "d60aa595-6bff-4686-a9ba-3e9b993a54ed",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Device set to use cuda:0\n",
+      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "def quicksort(arr): \n",
+      "\n",
+      "Proscure = \n",
+      "\n",
+      "Faced with a choice between the current and previous values, an error's resolution in a new value is not necessarily in order, since the first one is the first one that does not change. Prof will have to return a retry call for all possible errors returned from the previous value, which is equivalent to a new retry ( q @-@ f ). A simple recursion will perform only one recursion on the results. \n",
+      "\n",
+      "A recursion in alliter @-@ ordered values is done if it's possible to reorder them at all. This means a recursion in the first function of an array's contents is done if it isn 't possible to reorder them at all. This means, for example, that an array would have to be returned the same number of times in order to work as an array is. \n",
+      "\n",
+      "A recursion in\n"
+     ]
+    }
+   ],
+   "source": [
+    "code_generator = pipeline(\"text-generation\", model=\"fine_tuned_model\", tokenizer=tokenizer)\n",
+    "\n",
+    "prompt = \"def quicksort(arr):\"\n",
+    "generated_code = code_generator(prompt, max_length=200, num_return_sequences=1)\n",
+    "\n",
+    "print(generated_code[0][\"generated_text\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c82d049-147d-49e0-bc87-b7793c01dba1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}