Files changed (3)
  1. Untitled-1.py +0 -0
  2. app.py +99 -0
  3. untitled9.py +139 -0
Untitled-1.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,99 @@
+ import streamlit as st
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ # Page Config
+ st.set_page_config(
+     page_title="AI Text Generator",
+     page_icon="🤖",
+     layout="wide"
+ )
+
+ # Sidebar
+ st.sidebar.title("⚙️ Settings")
+
+ model_path = st.sidebar.text_input(
+     "Model Path",
+     value="gpt2"  # change to ./results if fine-tuned
+ )
+
+ max_length = st.sidebar.slider("Max Length", 50, 500, 150)
+ temperature = st.sidebar.slider("Temperature (Creativity)", 0.5, 1.5, 0.8)
+ top_k = st.sidebar.slider("Top-K", 10, 100, 50)
+ top_p = st.sidebar.slider("Top-P", 0.5, 1.0, 0.95)
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ st.sidebar.write(f"Device: **{device.upper()}**")
+
+ # Title
+ st.title("🤖 Professional AI Text Generator")
+ st.markdown("Generate creative and grammatically correct text using a GPT-based model.")
+
+ # Load Model (cached so it is loaded once, not on every rerun)
+ @st.cache_resource
+ def load_model(path):
+     tokenizer = AutoTokenizer.from_pretrained(path)
+     tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default
+     model = AutoModelForCausalLM.from_pretrained(path)
+     model.to(device)
+     model.eval()
+     return tokenizer, model
+
+ tokenizer, model = load_model(model_path)
+
+ # Input Area
+ col1, col2 = st.columns([2, 1])
+
+ with col1:
+     prompt = st.text_area(
+         "Enter your prompt:",
+         height=200,
+         placeholder="Example: Alice was walking through the forest when..."
+     )
+
+ with col2:
+     st.info("Tips:\n- Higher temperature = more creative\n- Lower temperature = more accurate\n- Use your fine-tuned model for best results")
+
+ # Generate Button
+ if st.button("✨ Generate Text", use_container_width=True):
+     if prompt.strip() == "":
+         st.warning("Please enter a prompt.")
+     else:
+         with st.spinner("Generating..."):
+             inputs = tokenizer(prompt, return_tensors="pt").to(device)
+
+             output = model.generate(
+                 **inputs,
+                 max_length=max_length,
+                 temperature=temperature,
+                 top_k=top_k,
+                 top_p=top_p,
+                 do_sample=True,
+                 pad_token_id=tokenizer.eos_token_id
+             )
+
+             generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
+
+         st.subheader("Generated Output")
+         st.write(generated_text)
+
+         # Download option
+         st.download_button(
+             label="📥 Download Text",
+             data=generated_text,
+             file_name="generated_text.txt",
+             mime="text/plain"
+         )
+
+ # Footer
+ st.markdown("---")
+ st.markdown("Built with ❤️ using Streamlit + Transformers")
untitled9.py ADDED
@@ -0,0 +1,139 @@
+ # -*- coding: utf-8 -*-
+ """Untitled9.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1foYg-5deMEmFrMZhgelziyR_ei_gEDrG
+ """
+
+ import torch
+ print("GPU Available:", torch.cuda.is_available())
+ print("Device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+
+ # Notebook shell cell; outside Colab/Jupyter run `pip install transformers datasets nltk` instead
+ !pip install transformers datasets nltk -q
+
+ from datasets import load_dataset
+
+ ds = load_dataset("Dwaraka/Testing_Dataset_of_Project_Gutebberg_Gothic_Fiction")
+
+ # Flatten the dataset into one raw-text file
+ # (assumes the dataset exposes a "train" split with a "text" column)
+ text = "\n".join(ds["train"]["text"])
+ with open("dataset.txt", "w", encoding="utf-8") as f:
+     f.write(text)
+
+ import re
+
+ with open("dataset.txt", "r", encoding="utf-8") as f:
+     text = f.read()
+
+ # Remove Gutenberg header/footer (assumes both markers occur in the text)
+ start = text.find("CHAPTER I")
+ end = text.find("End of the Project Gutenberg")
+ text = text[start:end]
+
+ # Basic cleaning
+ text = re.sub(r'\n+', '\n', text)
+ text = text.lower()
+
+ with open("clean_text.txt", "w", encoding="utf-8") as f:
+     f.write(text)
+
+ print("Cleaned text length:", len(text))
+
+ dataset = load_dataset("text", data_files={"train": "clean_text.txt"})
+ print(dataset)
+
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
+ tokenizer.pad_token = tokenizer.eos_token
+
+ def tokenize_function(examples):
+     return tokenizer(examples["text"], truncation=True, max_length=128, padding="max_length")
+
+ tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
+
+ # Split the dataset into training and evaluation sets
+ tokenized_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1)
+
+ train_dataset = tokenized_dataset["train"]
+ eval_dataset = tokenized_dataset["test"]
+
+ from transformers import AutoModelForCausalLM
+
+ model = AutoModelForCausalLM.from_pretrained("gpt2")
+
+ from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
+
+ training_args = TrainingArguments(
+     output_dir="./results",
+     num_train_epochs=1,  # increase to 3 for better results
+     per_device_train_batch_size=2,
+     save_steps=500,
+     save_total_limit=2,
+     logging_steps=100,
+     fp16=True  # mixed precision; requires a CUDA GPU
+ )
+
+ # mlm=False gives causal-LM labels (inputs shifted by one)
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+ train_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
+ eval_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+     eval_dataset=eval_dataset,
+     data_collator=data_collator,
+ )
+
+ # Verify the lengths of input_ids in the tokenized dataset
+ inconsistent_lengths = []
+ expected_length = 128
+
+ for i, example in enumerate(tokenized_dataset["train"]):
+     if len(example["input_ids"]) != expected_length:
+         inconsistent_lengths.append((i, len(example["input_ids"])))
+
+ if inconsistent_lengths:
+     print(f"Found {len(inconsistent_lengths)} examples with inconsistent input_ids lengths:")
+     for idx, length in inconsistent_lengths[:10]:  # print first 10 inconsistent examples
+         print(f"  Example index {idx}: length {length}")
+ else:
+     print(f"All input_ids in the training dataset have the expected length of {expected_length}.")
+
+ # Also check for unexpected columns
+ print("\nFeatures in tokenized_dataset['train']:")
+ print(tokenized_dataset["train"].features)
+
+ trainer.train()
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ prompt = "alice was feeling"
+ inputs = tokenizer(prompt, return_tensors="pt").to(device)
+
+ output = model.generate(
+     **inputs,
+     max_length=100,
+     temperature=0.8,
+     top_k=50,
+     top_p=0.95,
+     do_sample=True,
+     pad_token_id=tokenizer.eos_token_id  # silence the missing-pad-token warning
+ )
+
+ print(tokenizer.decode(output[0], skip_special_tokens=True))
+
+ import math
+
+ # Perplexity = exp(mean cross-entropy loss) on the eval set
+ eval_results = trainer.evaluate()
+ perplexity = math.exp(eval_results["eval_loss"])
+ print("Perplexity:", perplexity)
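Note: with the TrainingArguments above, trainer.train() only writes periodic checkpoints (e.g. ./results/checkpoint-500). To load the fine-tuned weights from app.py's "Model Path" box, save the model and tokenizer explicitly first; a minimal sketch, reusing the ./results directory from this script:

    trainer.save_model("./results")
    tokenizer.save_pretrained("./results")

For reading the final metric: perplexity is exp(eval_loss), so an eval_loss of 3.5 corresponds to a perplexity of roughly exp(3.5) ≈ 33.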