| import pandas as pd |
| import numpy as np |
| from transformers import GPT2Tokenizer, GPT2Model |
| from sklearn.preprocessing import MultiLabelBinarizer |
| from torch import nn |
| import torch |
| import openai |
| from collections import Counter |
| import nltk |
| from nltk.corpus import stopwords |
| from nltk.tokenize import word_tokenize |
|
|
class GenreClassifier(nn.Module):
    """Multi-label genre classifier on top of a pretrained GPT-2 encoder.

    The encoder's last hidden states are mean-pooled over the real
    (non-padding) tokens, passed through dropout and a linear layer, and
    squashed with a sigmoid so every genre gets an independent score in
    ``[0, 1]``.

    Args:
        num_genres: Number of genre scores produced per input. Defaults to 20.
    """

    def __init__(self, num_genres=20):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.dropout = nn.Dropout(0.1)
        # Size the head from the loaded checkpoint (768 for 'gpt2') instead of
        # hard-coding it, so the layer always matches the encoder width.
        self.genre_classifier = nn.Linear(
            self.gpt2.config.hidden_size, num_genres
        )
        self.sigmoid = nn.Sigmoid()

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Average ``hidden_states`` over the sequence axis, skipping padding.

        Args:
            hidden_states: Float tensor indexed as (batch, sequence, hidden).
            attention_mask: Tensor indexed as (batch, sequence) with 1 for
                real tokens and 0 for padding, or ``None`` to average over
                every position.

        Returns:
            Tensor indexed as (batch, hidden).
        """
        if attention_mask is None:
            return hidden_states.mean(dim=1)
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * mask).sum(dim=1)
        # Clamp so a fully-masked row yields zeros instead of a 0/0 NaN.
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        return summed / token_counts

    def forward(self, input_ids, attention_mask):
        """Return per-genre sigmoid scores for a batch of token ids.

        Args:
            input_ids: Token id tensor passed to GPT-2.
            attention_mask: Mask passed to GPT-2 and reused for pooling so
                padded positions do not dilute the pooled representation.

        Returns:
            Tensor indexed as (batch, num_genres) with values in ``[0, 1]``.
        """
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self._masked_mean(outputs[0], attention_mask)
        pooled_output = self.dropout(pooled_output)
        genre_logits = self.genre_classifier(pooled_output)
        return self.sigmoid(genre_logits)
|
|
class BookGenreAnalyzer:
    """Rank book genres using a GPT-2 classifier plus an OpenAI completion.

    The local ``GenreClassifier`` scores every label in ``genre_labels``;
    genres that the OpenAI completion also names get their score multiplied
    by ``GPT3_BOOST`` (capped at 1.0) before ranking.

    Args:
        api_key: OpenAI API key; stored on the ``openai`` module.
        completion_model: Model id sent to ``openai.Completion.create``.
        fine_tune_base_model: Model id sent to ``openai.FineTune.create``.

    NOTE(review): the default model id "gpt-3" is kept for backward
    compatibility but does not look like a real OpenAI model id. Pass an
    actual id (e.g. the fine-tuned model name) for the OpenAI calls to work.
    """

    # Multiplier for genres confirmed by the OpenAI completion.
    GPT3_BOOST = 1.2

    def __init__(self, api_key, *, completion_model="gpt-3",
                 fine_tune_base_model="gpt-3"):
        """Initialize the analyzer with OpenAI API key"""
        # The key is a module-level setting of the openai package; the
        # analyzer itself has no ``openai`` attribute.
        openai.api_key = api_key
        self.completion_model = completion_model
        self.fine_tune_base_model = fine_tune_base_model

        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # GPT-2 ships without a padding token; reuse EOS so that
        # padding='max_length' in preprocess_text does not raise.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.genre_labels = self._load_genre_labels()
        # Keep the classifier head in sync with the label list.
        self.model = GenreClassifier(num_genres=len(self.genre_labels))
        # Inference only: disable dropout so predictions are deterministic.
        self.model.eval()

        nltk.download('punkt')
        # Newer NLTK releases look up 'punkt_tab' for word_tokenize; fetching
        # both keeps old and new versions working.
        nltk.download('punkt_tab')
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))

    def _load_genre_labels(self):
        """Load predefined genre labels"""
        return [
            "Fiction", "Non-fiction", "Mystery", "Romance", "Science Fiction",
            "Fantasy", "Thriller", "Horror", "Historical Fiction", "Biography",
            "Self-help", "Business", "Science", "Philosophy", "Poetry",
            "Drama", "Adventure", "Literary Fiction", "Young Adult", "Children's"
        ]

    @staticmethod
    def _parse_genre_list(text):
        """Turn a completion such as " Mystery, Thriller" into a lookup set.

        Only the first non-empty line is used, because the prompt asks for a
        single comma-separated line. Names are stripped and case-folded so
        matching against ``genre_labels`` ignores case and stray whitespace.
        """
        lines = text.strip().splitlines()
        if not lines:
            return set()
        return {
            part.strip().casefold()
            for part in lines[0].split(',')
            if part.strip()
        }

    @staticmethod
    def _boost_scores(genres_with_scores, boosted_genres, factor=1.2):
        """Return a new list with ``factor`` applied to the boosted genres.

        Args:
            genres_with_scores: Iterable of ``(genre, score)`` pairs.
            boosted_genres: Set of case-folded genre names to boost.
            factor: Multiplier for boosted genres.

        Returns:
            List of ``(genre, score)`` pairs in the input order; boosted
            scores are capped at 1.0 so they remain valid confidences.
        """
        return [
            (
                genre,
                min(score * factor, 1.0)
                if genre.casefold() in boosted_genres
                else score,
            )
            for genre, score in genres_with_scores
        ]

    def preprocess_text(self, text):
        """Preprocess the book text

        Lower-cases and word-tokenizes ``text``, drops English stop words,
        then encodes the remaining words with the GPT-2 tokenizer, truncated
        and padded to 1024 tokens.

        Returns:
            The tokenizer output holding ``input_ids`` and ``attention_mask``
            as PyTorch tensors.
        """
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens if t not in self.stop_words]

        encodings = self.tokenizer(
            ' '.join(tokens),
            truncation=True,
            max_length=1024,
            padding='max_length',
            return_tensors='pt'
        )
        return encodings

    def extract_features(self, text):
        """Extract relevant features from the text

        Returns:
            The classifier output for ``text`` (one row of per-genre
            scores), computed without gradient tracking.
        """
        encodings = self.preprocess_text(text)
        with torch.no_grad():
            features = self.model(
                input_ids=encodings['input_ids'],
                attention_mask=encodings['attention_mask']
            )
        return features

    def fine_tune_with_gpt3(self, training_data):
        """Fine-tune the model using GPT-3

        Args:
            training_data: Iterable of ``(book_text, genres)`` pairs, where
                ``genres`` is an iterable of genre names.

        Returns:
            The ``openai.FineTune.create`` response, or ``None`` when there
            is no training data or the request fails.
        """
        formatted_data = [
            {
                "prompt": f"Book text: {book_text[:1000]}...\nGenres:",
                "completion": f" {', '.join(genres)}"
            }
            for book_text, genres in training_data
        ]

        # Nothing to learn from: skip the upload instead of sending an
        # empty file to the API.
        if not formatted_data:
            print("Fine-tuning error: no training data provided")
            return None

        try:
            response = openai.FineTune.create(
                training_file=self._upload_training_data(formatted_data),
                model=self.fine_tune_base_model,
                n_epochs=3,
                batch_size=4,
                learning_rate_multiplier=0.1
            )
            return response
        except Exception as e:
            print(f"Fine-tuning error: {e}")
            return None

    def _upload_training_data(self, formatted_data):
        """Upload training data to OpenAI

        Writes ``formatted_data`` as JSON Lines to ``training_data.jsonl`` in
        the current working directory, uploads that file with purpose
        ``fine-tune`` and returns the id of the uploaded file.
        """
        import json
        with open('training_data.jsonl', 'w', encoding='utf-8') as f:
            for entry in formatted_data:
                json.dump(entry, f)
                f.write('\n')

        with open('training_data.jsonl', 'rb') as f:
            response = openai.File.create(
                file=f,
                purpose='fine-tune'
            )
        return response.id

    def analyze_book(self, book_text):
        """Analyze a book and return top 20 genres with confidence scores

        Args:
            book_text: Full text of the book; only the first 1000 characters
                are sent to the OpenAI completion.

        Returns:
            Up to 20 ``(genre, score)`` tuples sorted by descending score.
        """
        features = self.extract_features(book_text)
        predictions = features.numpy()[0]

        # The OpenAI lookup is best-effort: on any failure fall back to the
        # classifier scores alone, but report why.
        try:
            response = openai.Completion.create(
                model=self.completion_model,
                prompt=f"Book text: {book_text[:1000]}...\nGenres:",
                max_tokens=100,
                temperature=0.3
            )
            gpt3_genres = self._parse_genre_list(response.choices[0].text)
        except Exception as e:
            print(f"GPT-3 genre lookup error: {e}")
            gpt3_genres = set()

        genres_with_scores = [
            (genre, float(score))
            for genre, score in zip(self.genre_labels, predictions)
        ]

        # Build a new list: tuples are immutable, so mutating a loop
        # variable would silently discard the boost.
        genres_with_scores = self._boost_scores(
            genres_with_scores, gpt3_genres, self.GPT3_BOOST
        )

        return sorted(genres_with_scores, key=lambda x: x[1], reverse=True)[:20]
|
|
| |
def main():
    """Demo driver: rank genres for a placeholder text, then start a fine-tune.

    Builds a ``BookGenreAnalyzer`` with a placeholder API key, prints the
    ranked genres for a placeholder book text, and finally submits a small
    sample training set for fine-tuning, reporting success if a response
    comes back.
    """
    genre_analyzer = BookGenreAnalyzer('your-api-key')

    # Placeholder input; replace with the real book text.
    book_text = """
    [Your book text here]
    """

    ranked_genres = genre_analyzer.analyze_book(book_text)

    print("\nTop 20 Genres:")
    for label, score in ranked_genres:
        print(f"{label}: {score:.2%}")

    # Sample (text, genres) pairs used to create the fine-tuning job.
    sample_training_set = [
        ("Book 1 text...", ["Mystery", "Thriller"]),
        ("Book 2 text...", ["Science Fiction", "Adventure"]),
    ]

    job = genre_analyzer.fine_tune_with_gpt3(sample_training_set)
    if job:
        print("\nFine-tuning job created successfully!")
|
|
# Script entry point: run the demo only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()