Model_finetuning / README.md

Create README.md

b1f9cb7 verified 11 months ago

6.16 kB

	import pandas as pd
	import numpy as np
	from transformers import GPT2Tokenizer, GPT2Model
	from sklearn.preprocessing import MultiLabelBinarizer
	from torch import nn
	import torch
	import openai
	from collections import Counter
	import nltk
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize

	class GenreClassifier(nn.Module):
	    """Multi-label genre classifier built on a pretrained GPT-2 encoder.

	    The GPT-2 hidden states are mean-pooled over the sequence, passed through
	    dropout and a linear layer, and squashed with a sigmoid so that every
	    genre receives an independent score in ``[0, 1]``.

	    Args:
	        num_genres: Number of output genre scores produced per input.
	    """

	    def __init__(self, num_genres=20):
	        super().__init__()
	        self.gpt2 = GPT2Model.from_pretrained('gpt2')
	        self.dropout = nn.Dropout(0.1)
	        # Read the width from the loaded backbone instead of hard-coding 768,
	        # so the head stays correct if a different GPT-2 size is substituted.
	        self.genre_classifier = nn.Linear(self.gpt2.config.hidden_size, num_genres)
	        self.sigmoid = nn.Sigmoid()

	    def forward(self, input_ids, attention_mask):
	        """Return per-genre probabilities for a batch of token sequences.

	        Args:
	            input_ids: Token ids forwarded to GPT-2.
	            attention_mask: Mask forwarded to GPT-2 and reused for pooling;
	                non-zero entries mark real tokens, zeros mark padding.

	        Returns:
	            Tensor with one sigmoid score per genre for each batch row.
	        """
	        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
	        hidden_states = outputs[0]

	        # Average only over real tokens. A plain mean over dim=1 would also
	        # average the hidden states of padding positions, which dominate when
	        # inputs are padded to a fixed maximum length.
	        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
	        token_sum = (hidden_states * mask).sum(dim=1)
	        # Clamp so an all-padding row yields zeros instead of dividing by zero.
	        token_count = mask.sum(dim=1).clamp(min=1.0)
	        pooled_output = token_sum / token_count

	        pooled_output = self.dropout(pooled_output)
	        genre_logits = self.genre_classifier(pooled_output)
	        return self.sigmoid(genre_logits)

	class BookGenreAnalyzer:
	    """Score book text against a fixed genre list.

	    Scores come from a local GPT-2 based ``GenreClassifier``; genres that an
	    OpenAI completion also names for the same text receive a score boost.
	    The classifier head is freshly initialised here, so its scores are only
	    meaningful once trained weights have been loaded into ``self.model``.
	    """

	    # Model name sent to OpenAI for fine-tuning and completions. The previous
	    # literal "gpt-3" names a model family, not a model identifier.
	    # NOTE(review): "davinci" is a GPT-3 base model of the legacy fine-tunes
	    # API this code calls; OpenAI has since deprecated legacy models and
	    # endpoints, so confirm the SDK version and model this project targets.
	    BASE_MODEL = "davinci"
	    # Number of leading characters of a book that are placed in a prompt.
	    PROMPT_CHARS = 1000

	    def __init__(self, api_key):
	        """Initialize the analyzer with an OpenAI API key.

	        Loads the GPT-2 tokenizer and classifier and downloads the NLTK
	        resources used for tokenization and stop-word removal.

	        Args:
	            api_key: OpenAI API key, stored on the ``openai`` module.
	        """
	        # The key lives on the openai module; the instance has no ``openai``
	        # attribute, so ``self.openai.api_key`` raised AttributeError.
	        openai.api_key = api_key
	        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
	        # GPT-2 defines no padding token; without one the tokenizer refuses
	        # ``padding='max_length'``. Reusing EOS is safe because padded
	        # positions are masked out via the attention mask.
	        self.tokenizer.pad_token = self.tokenizer.eos_token
	        self.genre_labels = self._load_genre_labels()
	        # Size the head from the label list so the two can never disagree.
	        self.model = GenreClassifier(num_genres=len(self.genre_labels))
	        # Inference only: disable dropout so repeated calls give equal scores.
	        self.model.eval()
	        # 'punkt_tab' is what newer NLTK releases load for word_tokenize;
	        # 'punkt' is kept for older releases.
	        for resource in ('punkt', 'punkt_tab', 'stopwords'):
	            nltk.download(resource, quiet=True)
	        self.stop_words = set(stopwords.words('english'))

	    def _load_genre_labels(self):
	        """Return the predefined genre labels, in classifier output order."""
	        # You would typically load these from a file or database
	        return [
	            "Fiction", "Non-fiction", "Mystery", "Romance", "Science Fiction",
	            "Fantasy", "Thriller", "Horror", "Historical Fiction", "Biography",
	            "Self-help", "Business", "Science", "Philosophy", "Poetry",
	            "Drama", "Adventure", "Literary Fiction", "Young Adult", "Children's"
	        ]

	    def _build_prompt(self, book_text):
	        """Build the prompt shared by fine-tuning examples and inference."""
	        return f"Book text: {book_text[:self.PROMPT_CHARS]}...\nGenres:"

	    def preprocess_text(self, text):
	        """Lower-case the text, drop stop words, and encode it for GPT-2.

	        Args:
	            text: Raw book text.

	        Returns:
	            Tokenizer output holding ``input_ids`` and ``attention_mask``
	            tensors, truncated and padded to 1024 tokens.
	        """
	        tokens = word_tokenize(text.lower())
	        tokens = [t for t in tokens if t not in self.stop_words]

	        return self.tokenizer(
	            ' '.join(tokens),
	            truncation=True,
	            max_length=1024,
	            padding='max_length',
	            return_tensors='pt'
	        )

	    def extract_features(self, text):
	        """Run the classifier on ``text`` and return its genre scores.

	        Gradients are disabled because this path is used for inference only.
	        """
	        encodings = self.preprocess_text(text)
	        with torch.no_grad():
	            features = self.model(
	                input_ids=encodings['input_ids'],
	                attention_mask=encodings['attention_mask']
	            )
	        return features

	    def fine_tune_with_gpt3(self, training_data, base_model=BASE_MODEL):
	        """Create an OpenAI fine-tuning job from labelled book texts.

	        Args:
	            training_data: Iterable of ``(book_text, genres)`` pairs, where
	                ``genres`` is an iterable of genre names.
	            base_model: Name of the OpenAI model to fine-tune.

	        Returns:
	            The response of the fine-tune creation call, or ``None`` when
	            there is no training data or the request fails.
	        """
	        # Prepare training data in the prompt/completion format
	        formatted_data = [
	            {
	                "prompt": self._build_prompt(book_text),
	                "completion": f" {', '.join(genres)}"
	            }
	            for book_text, genres in training_data
	        ]
	        if not formatted_data:
	            # Uploading an empty file would only produce a remote error.
	            print("Fine-tuning error: no training examples provided")
	            return None

	        # The upload happens inside the try block so that upload failures are
	        # reported the same way as job-creation failures.
	        try:
	            response = openai.FineTune.create(
	                training_file=self._upload_training_data(formatted_data),
	                model=base_model,
	                n_epochs=3,
	                batch_size=4,
	                learning_rate_multiplier=0.1
	            )
	            return response
	        except Exception as e:
	            print(f"Fine-tuning error: {e}")
	            return None

	    def _upload_training_data(self, formatted_data):
	        """Write the examples to ``training_data.jsonl`` and upload the file.

	        Args:
	            formatted_data: List of JSON-serialisable example dicts.

	        Returns:
	            The ``id`` of the uploaded OpenAI file.
	        """
	        import json

	        with open('training_data.jsonl', 'w') as f:
	            for entry in formatted_data:
	                json.dump(entry, f)
	                f.write('\n')

	        with open('training_data.jsonl', 'rb') as f:
	            response = openai.File.create(
	                file=f,
	                purpose='fine-tune'
	            )
	        return response.id

	    def _query_gpt3_genres(self, book_text, model=BASE_MODEL):
	        """Ask OpenAI for genre names; return ``[]`` if the request fails."""
	        # Best effort: the local predictions remain usable without the
	        # remote call, so failures are reported instead of raised.
	        try:
	            response = openai.Completion.create(
	                model=model,  # Use fine-tuned model ID if available
	                prompt=self._build_prompt(book_text),
	                max_tokens=100,
	                temperature=0.3
	            )
	            completion_text = response.choices[0].text
	        except Exception as e:
	            print(f"GPT-3 genre lookup failed: {e}")
	            return []
	        # Split on bare commas and trim, so "A,B" and "A, B" both parse.
	        return [name.strip() for name in completion_text.split(',') if name.strip()]

	    def _apply_gpt3_boost(self, genres_with_scores, gpt3_genres, boost=1.2):
	        """Return new ``(genre, score)`` pairs with mentioned genres boosted.

	        Matching ignores case and surrounding whitespace. Boosted scores are
	        capped at 1.0 so they still read as confidences.
	        """
	        mentioned = {name.strip().casefold() for name in gpt3_genres}
	        # Build a new list: multiplying the loop variable in place would
	        # leave the stored tuples unchanged.
	        return [
	            (genre, min(score * boost, 1.0) if genre.casefold() in mentioned else score)
	            for genre, score in genres_with_scores
	        ]

	    def analyze_book(self, book_text, *, top_k=20, boost=1.2):
	        """Analyze a book and return its top genres with confidence scores.

	        Args:
	            book_text: Raw book text.
	            top_k: Maximum number of genres to return.
	            boost: Multiplier applied to genres that the OpenAI completion
	                also names.

	        Returns:
	            List of ``(genre, score)`` tuples sorted by descending score.
	        """
	        # Get base predictions from our model
	        features = self.extract_features(book_text)
	        predictions = features.detach().cpu().numpy()[0]

	        genres_with_scores = [
	            (genre, float(score))
	            for genre, score in zip(self.genre_labels, predictions)
	        ]

	        # Boost scores for genres mentioned by GPT-3
	        gpt3_genres = self._query_gpt3_genres(book_text)
	        genres_with_scores = self._apply_gpt3_boost(genres_with_scores, gpt3_genres, boost)

	        return sorted(genres_with_scores, key=lambda x: x[1], reverse=True)[:top_k]

	# Example usage
	def main():
	    """Run a small demo: score one book, then create a fine-tuning job.

	    The API key is read from the ``OPENAI_API_KEY`` environment variable so
	    that a real key never has to be pasted into the source; the placeholder
	    string is used when the variable is unset or empty.
	    """
	    import os

	    # Initialize analyzer
	    api_key = os.environ.get("OPENAI_API_KEY") or 'your-api-key'
	    analyzer = BookGenreAnalyzer(api_key)

	    # Example book text (placeholder; replace with the text to classify)
	    book_text = "\n[Your book text here]\n"

	    # Get genre predictions
	    genres = analyzer.analyze_book(book_text)

	    # Print results
	    print("\nTop 20 Genres:")
	    for genre, confidence in genres:
	        print(f"{genre}: {confidence:.2%}")

	    # Example of fine-tuning
	    training_data = [
	        ("Book 1 text...", ["Mystery", "Thriller"]),
	        ("Book 2 text...", ["Science Fiction", "Adventure"]),
	        # Add more training examples
	    ]

	    fine_tune_response = analyzer.fine_tune_with_gpt3(training_data)
	    if fine_tune_response:
	        print("\nFine-tuning job created successfully!")


	# Run the demo only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()