MLCraftsman committed on
Commit
7c2f77f
·
verified ·
1 Parent(s): 484b963

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +124 -0
  2. untitled9.py +139 -0
app.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM
4
+
5
# -----------------------------
# Page Configuration
# -----------------------------
st.set_page_config(page_title="AI Text Generator", page_icon="🤖", layout="wide")

# -----------------------------
# Sidebar Settings
# -----------------------------
st.sidebar.title("⚙️ Settings")

# Hugging Face model id or a local directory.
# Change the default to "./results" if using the fine-tuned model.
model_path = st.sidebar.text_input("Model Path", value="gpt2")

# Sampling controls; each slider is (min, max, default).
max_length = st.sidebar.slider("Max Length", min_value=50, max_value=500, value=150)
temperature = st.sidebar.slider(
    "Temperature (Creativity)", min_value=0.5, max_value=1.5, value=0.8
)
top_k = st.sidebar.slider("Top-K", min_value=10, max_value=100, value=50)
top_p = st.sidebar.slider("Top-P", min_value=0.5, max_value=1.0, value=0.95)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
st.sidebar.write(f"Device: **{device.upper()}**")

# -----------------------------
# Title
# -----------------------------
st.title("🤖 Professional AI Text Generator")
st.markdown(
    "Generate creative and grammatically correct text using a GPT-based model."
)
39
+
40
+ # -----------------------------
41
+ # Load Model (Cached)
42
+ # -----------------------------
43
@st.cache_resource
def load_model(path):
    """Load the tokenizer and causal language model stored at ``path``.

    The function is wrapped in ``st.cache_resource`` so the loaded objects
    are reused across Streamlit reruns instead of being reloaded each time.

    Args:
        path: Model identifier or local directory passed to
            ``from_pretrained`` for both the tokenizer and the model.

    Returns:
        A ``(tokenizer, model)`` tuple. The model has been moved to the
        module-level ``device`` and switched to evaluation mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(path)
    # GPT-2 style tokenizers ship without a pad token, so fall back to EOS.
    # Only do this when no pad token exists, so a model that defines its own
    # padding token is not silently overridden.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(path)
    model.to(device)
    model.eval()

    return tokenizer, model
53
+
54
# Load the model; on any failure show the error in the page and halt this run.
try:
    tokenizer, model = load_model(model_path)
except Exception as load_error:
    st.error(f"Error loading model: {load_error}")
    st.stop()

# -----------------------------
# Input Area
# -----------------------------
# Two columns at a 2:1 width ratio: prompt box on the left, tips on the right.
input_col, tips_col = st.columns([2, 1])

with input_col:
    prompt = st.text_area(
        "Enter your prompt:",
        height=200,
        placeholder="Example: Alice was walking through the forest when...",
    )

with tips_col:
    tip_lines = (
        "Tips:",
        "- Higher temperature = more creative",
        "- Lower temperature = more accurate",
        "- Use your fine-tuned model for best results",
    )
    st.info("\n".join(tip_lines))
80
+
81
# -----------------------------
# Generate Text
# -----------------------------
if st.button("✨ Generate Text", use_container_width=True):

    if not prompt.strip():
        st.warning("Please enter a prompt.")
    else:
        with st.spinner("Generating..."):

            inputs = tokenizer(prompt, return_tensors="pt").to(device)
            prompt_tokens = inputs["input_ids"].shape[-1]

            if prompt_tokens >= max_length:
                # `max_length` counts the prompt tokens as well, so a prompt
                # this long leaves no room for any newly generated token.
                st.warning(
                    f"Prompt is {prompt_tokens} tokens long, which already "
                    f"meets or exceeds Max Length ({max_length}). "
                    "Shorten the prompt or raise Max Length."
                )
            else:
                output = model.generate(
                    **inputs,
                    max_length=max_length,
                    temperature=temperature,
                    top_k=top_k,
                    top_p=top_p,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id,
                )

                # Keep the result in session state: pressing any widget
                # (including the download button below) reruns the script,
                # and on that rerun st.button() is False again.
                st.session_state["generated_text"] = tokenizer.decode(
                    output[0],
                    skip_special_tokens=True,
                )

# Render the most recent result outside the button branch so it survives
# the rerun triggered by the download button.
if "generated_text" in st.session_state:
    generated_text = st.session_state["generated_text"]

    st.subheader("Generated Output")
    st.write(generated_text)

    # Download Button
    st.download_button(
        label="📥 Download Text",
        data=generated_text,
        file_name="generated_text.txt",
        mime="text/plain",
    )

# -----------------------------
# Footer
# -----------------------------
st.markdown("---")
st.markdown("Built with ❤️ using Streamlit + Transformers")
124
+
untitled9.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Untitled9.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1foYg-5deMEmFrMZhgelziyR_ei_gEDrG
8
+ """
9
+
10
import torch

print("GPU Available:", torch.cuda.is_available())
print("Device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# The notebook used the shell magic `!pip install ...`, which is a syntax
# error in a plain .py file. Install the dependencies from a terminal first:
#   pip install transformers datasets nltk -q

from datasets import load_dataset

ds = load_dataset("Dwaraka/Testing_Dataset_of_Project_Gutebberg_Gothic_Fiction")

# The original cell wrote `text` to disk before it was ever defined
# (NameError) and never used `ds`. Build the raw text from the dataset.
# NOTE(review): the dataset schema is not visible here. This assumes the
# first split has a string column (preferring one named "text") holding
# the book text; confirm against the dataset card.
first_split = next(iter(ds.values()))
string_columns = [
    name
    for name, feature in first_split.features.items()
    if getattr(feature, "dtype", None) in ("string", "large_string")
]
if not string_columns:
    raise ValueError("No string column found in the dataset to export as text.")
text_column = "text" if "text" in string_columns else string_columns[0]
text = "\n".join(value for value in first_split[text_column] if value)

with open("dataset.txt", "w", encoding="utf-8") as f:
    f.write(text)
22
+
23
import re

with open("dataset.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Remove Gutenberg header/footer
start = text.find("CHAPTER I")
if start == -1:
    # str.find returns -1 when the marker is missing; slicing from -1 would
    # keep only the final character, so keep the text from the beginning.
    start = 0

# Search for the footer only after the start marker so the slice can never
# be inverted, and keep everything to the end when there is no footer.
end = text.find("End of the Project Gutenberg", start)
if end == -1:
    end = len(text)

text = text[start:end]

# Basic cleaning: collapse runs of newlines and lowercase everything.
text = re.sub(r'\n+', '\n', text)
text = text.lower()

with open("clean_text.txt", "w", encoding="utf-8") as f:
    f.write(text)

print("Cleaned text length:", len(text))
41
+
42
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the cleaned text file with the "text" loader as a single "train" split.
dataset = load_dataset("text", data_files={"train": "clean_text.txt"})
print(dataset)

# The GPT-2 tokenizer has no pad token by default; reuse EOS for padding.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of examples, truncating/padding each to 128 tokens."""
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    return encoded


tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Split the dataset into training and evaluation sets (90% / 10%).
tokenized_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1)

train_dataset, eval_dataset = tokenized_dataset["train"], tokenized_dataset["test"]
62
+
63
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,  # increase to 3 for better results
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    # Mixed precision needs a CUDA device; a hard-coded True makes the script
    # fail on CPU-only machines, so enable it only when a GPU is present.
    fp16=torch.cuda.is_available(),
)

# mlm=False selects causal (next-token) language modeling rather than
# masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Expose only the model inputs as torch tensors.
train_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
eval_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)
91
+
92
# Sanity check: every training example should carry exactly `expected_length`
# token ids, since tokenization pads/truncates to a fixed size.
expected_length = 128

inconsistent_lengths = [
    (position, len(row["input_ids"]))
    for position, row in enumerate(tokenized_dataset["train"])
    if len(row["input_ids"]) != expected_length
]

if not inconsistent_lengths:
    print(f"All input_ids in the training dataset have the expected length of {expected_length}.")
else:
    print(f"Found {len(inconsistent_lengths)} examples with inconsistent input_ids lengths:")
    # Only report the first 10 offenders to keep the output short.
    for idx, length in inconsistent_lengths[:10]:
        print(f"  Example index {idx}: length {length}")

# Also show the remaining columns/features of the training split.
print("\nFeatures in tokenized_dataset['train']:")
print(tokenized_dataset["train"].features)
110
+
111
trainer.train()

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# The model may still be in training mode after trainer.train(); switch to
# eval mode so dropout does not add noise on top of the sampling.
model.eval()

# Lowercase prompt, matching the lowercased training text.
prompt = "alice was feeling"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

output = model.generate(
    **inputs,
    max_length=100,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    # GPT-2 has no dedicated pad token; passing EOS explicitly makes the
    # padding choice deliberate instead of an implicit fallback.
    pad_token_id=tokenizer.eos_token_id,
)

print(tokenizer.decode(output[0], skip_special_tokens=True))

import math

# Perplexity is the exponential of the evaluation loss on the held-out split.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print("Perplexity:", perplexity)
137
+
138
+
139
+