import json
import os
from collections import Counter

import nltk
import numpy as np
import openai
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer
from torch import nn
from transformers import GPT2Tokenizer, GPT2Model
class GenreClassifier(nn.Module):
    """Multi-label genre classifier: a GPT-2 encoder followed by a linear head.

    The encoder's token representations are mean-pooled into one vector per
    sequence, passed through dropout and a linear layer, and squashed with a
    sigmoid so every genre gets an independent score in [0, 1].

    Args:
        num_genres: Number of output genre scores (width of the linear head).
    """

    def __init__(self, num_genres=20):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.dropout = nn.Dropout(0.1)
        # Take the head's input width from the loaded encoder rather than
        # hard-coding it (it is 768 for the base 'gpt2' checkpoint).
        self.genre_classifier = nn.Linear(self.gpt2.config.n_embd, num_genres)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask=None):
        """Return per-genre sigmoid scores for a batch of token ids.

        Args:
            input_ids: Token id tensor passed straight to the GPT-2 encoder.
            attention_mask: Optional tensor with 1 for real tokens and 0 for
                padding. When given, padded positions are excluded from the
                pooled average; when ``None``, every position is averaged.

        Returns:
            Tensor with one row per input sequence and one sigmoid score per
            genre.
        """
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs[0]

        if attention_mask is None:
            pooled_output = hidden_states.mean(dim=1)
        else:
            # Masked mean: a plain mean over the sequence axis would let the
            # padding positions dilute the representation of short texts.
            mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            # clamp avoids a division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            pooled_output = (hidden_states * mask).sum(dim=1) / token_counts

        pooled_output = self.dropout(pooled_output)
        genre_logits = self.genre_classifier(pooled_output)
        return self.sigmoid(genre_logits)
class BookGenreAnalyzer:
    """Score a book's text against a fixed list of genres.

    Scores come from a local ``GenreClassifier`` (GPT-2 encoder + linear
    head) and are optionally boosted for genres that an OpenAI completion
    model also names. Nothing in this class trains or loads weights for the
    classifier head, so its scores are only meaningful once the model has
    been trained elsewhere.

    NOTE(review): the calls used here (``openai.Completion``,
    ``openai.FineTune``, ``openai.File``) appear to be the module-level
    interface of the pre-1.0 ``openai`` package - confirm the pinned version.
    """

    # Model id sent to OpenAI when the caller does not supply one.
    # NOTE(review): "gpt-3" is preserved from the original code but does not
    # look like an id the API accepts - confirm and override with a real base
    # model or a fine-tuned model id.
    gpt3_model = "gpt-3"

    # Multiplier applied to the local score of each genre GPT-3 also names.
    # Boosted scores are not clamped, so they may exceed 1.0.
    gpt3_boost = 1.2

    def __init__(self, api_key):
        """Initialize the analyzer with an OpenAI API key.

        Loads the GPT-2 tokenizer and classifier and downloads the NLTK
        'punkt' and 'stopwords' resources.

        Args:
            api_key: OpenAI API key; stored on the ``openai`` module.
        """
        # The key lives on the openai module, not on this instance.
        openai.api_key = api_key
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # GPT-2 has no padding token by default; without one the
        # padding='max_length' request in preprocess_text cannot be served.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = GenreClassifier()
        # Inference only: switch dropout off so repeated calls on the same
        # text give the same scores.
        self.model.eval()
        self.genre_labels = self._load_genre_labels()
        nltk.download('punkt')
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))

    def _load_genre_labels(self):
        """Return the predefined genre labels, in classifier output order."""
        # You would typically load these from a file or database
        return [
            "Fiction", "Non-fiction", "Mystery", "Romance", "Science Fiction",
            "Fantasy", "Thriller", "Horror", "Historical Fiction", "Biography",
            "Self-help", "Business", "Science", "Philosophy", "Poetry",
            "Drama", "Adventure", "Literary Fiction", "Young Adult", "Children's"
        ]

    @staticmethod
    def _build_prompt(book_text):
        """Build the prompt shared by fine-tuning examples and live queries."""
        # Training and inference must use the same format, so it is defined
        # in one place. Only the first 1000 characters of the book are sent.
        return f"Book text: {book_text[:1000]}...\nGenres:"

    @staticmethod
    def _parse_genres(completion_text):
        """Split a completion into a set of case-folded genre names.

        Genres may be separated by commas or newlines; surrounding whitespace
        is dropped and empty pieces are ignored.
        """
        pieces = completion_text.replace('\n', ',').split(',')
        return {piece.strip().casefold() for piece in pieces if piece.strip()}

    def _rank_genres(self, predictions, gpt3_genres):
        """Pair labels with scores, boost GPT-3 matches, sort descending.

        Args:
            predictions: One score per entry of ``self.genre_labels``.
            gpt3_genres: Case-folded genre names returned by GPT-3.

        Returns:
            List of ``(genre, score)`` tuples, highest score first.
        """
        genres_with_scores = []
        for genre, score in zip(self.genre_labels, predictions):
            score = float(score)
            if genre.casefold() in gpt3_genres:
                score *= self.gpt3_boost
            genres_with_scores.append((genre, score))
        return sorted(genres_with_scores, key=lambda x: x[1], reverse=True)

    def preprocess_text(self, text):
        """Lower-case, drop English stop words, and encode for GPT-2.

        Args:
            text: Raw book text.

        Returns:
            Tokenizer output with PyTorch tensors, truncated or padded to
            1024 tokens.
        """
        # Tokenize and remove stop words
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens if t not in self.stop_words]
        # Convert to GPT2 tokens
        encodings = self.tokenizer(
            ' '.join(tokens),
            truncation=True,
            max_length=1024,
            padding='max_length',
            return_tensors='pt'
        )
        return encodings

    def extract_features(self, text):
        """Run the classifier on ``text`` and return its genre scores.

        Gradients are disabled; the result is the classifier's output tensor
        for a batch of one.
        """
        encodings = self.preprocess_text(text)
        with torch.no_grad():
            features = self.model(
                input_ids=encodings['input_ids'],
                attention_mask=encodings['attention_mask']
            )
        return features

    def fine_tune_with_gpt3(self, training_data, base_model=None):
        """Create an OpenAI fine-tuning job from labelled books.

        Args:
            training_data: Iterable of ``(book_text, genres)`` pairs, where
                ``genres`` is a list of genre names.
            base_model: Model id to fine-tune; defaults to ``self.gpt3_model``.

        Returns:
            The API response for the created job, or ``None`` if the upload
            or job creation failed (the error is printed).
        """
        # Prepare training data in the format expected by OpenAI
        formatted_data = [
            {
                "prompt": self._build_prompt(book_text),
                "completion": f" {', '.join(genres)}"
            }
            for book_text, genres in training_data
        ]
        # Create fine-tuning job
        try:
            response = openai.FineTune.create(
                training_file=self._upload_training_data(formatted_data),
                model=base_model or self.gpt3_model,
                n_epochs=3,
                batch_size=4,
                learning_rate_multiplier=0.1
            )
            return response
        except Exception as e:
            print(f"Fine-tuning error: {e}")
            return None

    def _upload_training_data(self, formatted_data):
        """Write the examples to 'training_data.jsonl' and upload the file.

        Returns:
            The id of the uploaded file as reported by OpenAI.
        """
        with open('training_data.jsonl', 'w') as f:
            for entry in formatted_data:
                f.write(json.dumps(entry) + '\n')
        with open('training_data.jsonl', 'rb') as f:
            response = openai.File.create(
                file=f,
                purpose='fine-tune'
            )
        return response.id

    def analyze_book(self, book_text, model=None):
        """Analyze a book and return up to 20 genres with confidence scores.

        Args:
            book_text: Raw book text.
            model: Completion model id (for example a fine-tuned model);
                defaults to ``self.gpt3_model``.

        Returns:
            List of ``(genre, score)`` tuples sorted by descending score.
            Genres also named by GPT-3 have their score multiplied by
            ``self.gpt3_boost``. If the GPT-3 call fails, the error is
            printed and the local scores are returned unboosted.
        """
        # Get base predictions from our model
        features = self.extract_features(book_text)
        predictions = features.numpy()[0]
        # Use GPT-3 to enhance predictions (best effort)
        try:
            response = openai.Completion.create(
                model=model or self.gpt3_model,
                prompt=self._build_prompt(book_text),
                max_tokens=100,
                temperature=0.3
            )
            gpt3_genres = self._parse_genres(response.choices[0].text)
        except Exception as e:
            print(f"GPT-3 genre lookup failed: {e}")
            gpt3_genres = set()
        # Combine both predictions, sort and return top 20
        return self._rank_genres(predictions, gpt3_genres)[:20]
# Example usage
def main():
    """Demo: score a placeholder book text, then submit a fine-tuning job.

    The OpenAI API key is read from the ``OPENAI_API_KEY`` environment
    variable; the 'your-api-key' placeholder is used when it is not set.
    """
    # Read the key from the environment so a real key never has to be
    # written into (and committed with) the source file.
    api_key = os.environ.get('OPENAI_API_KEY', 'your-api-key')

    # Initialize analyzer
    analyzer = BookGenreAnalyzer(api_key)

    # Example book text
    book_text = """
    [Your book text here]
    """

    # Get genre predictions
    genres = analyzer.analyze_book(book_text)

    # Print results
    print("\nTop 20 Genres:")
    for genre, confidence in genres:
        print(f"{genre}: {confidence:.2%}")

    # Example of fine-tuning
    training_data = [
        ("Book 1 text...", ["Mystery", "Thriller"]),
        ("Book 2 text...", ["Science Fiction", "Adventure"]),
        # Add more training examples
    ]
    fine_tune_response = analyzer.fine_tune_with_gpt3(training_data)
    if fine_tune_response:
        print("\nFine-tuning job created successfully!")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()