Upload NLP model for cataloging created also by me (Ayman Osman).ipynb
Browse files
NLP model for cataloging created also by me (Ayman Osman).ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[{"file_id":"1SFOlZIqzDD4q8ejIM-6zYcljeDM6_3q9","timestamp":1733549397812}],"authorship_tag":"ABX9TyPgsub6zrkDLLFOowwqOrmx"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"wn3qjpCMh8Lh"},"outputs":[],"source":[
"# Install Required Libraries (%pip targets the kernel's environment;\n",
"# sentencepiece is required by T5Tokenizer and is not pulled in by transformers)\n",
"%pip install -q transformers sentencepiece pytesseract pymarc datasets pandas scikit-learn\n",
"\n",
"# Import Necessary Libraries\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from datasets import Dataset\n",
"from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments\n",
"from pymarc import Record, Field, Subfield, record_to_xml\n",
"from google.colab import files\n",
"\n",
"# Step 1: Simulate Training Data for Fine-Tuning\n",
"data = [\n",
"    {\"input\": \"Extract metadata from: Artificial Intelligence: A Modern Approach by Stuart Russell, 2020\",\n",
"     \"output\": \"title: Artificial Intelligence: A Modern Approach | author: Stuart Russell | year: 2020\"},\n",
"    {\"input\": \"Extract metadata from: Deep Learning by Ian Goodfellow, published by MIT Press in 2016\",\n",
"     \"output\": \"title: Deep Learning | author: Ian Goodfellow | publisher: MIT Press | year: 2016\"},\n",
"    {\"input\": \"Extract metadata from: Python Crash Course by Eric Matthes, No Starch Press, 2019\",\n",
"     \"output\": \"title: Python Crash Course | author: Eric Matthes | publisher: No Starch Press | year: 2019\"},\n",
"    {\"input\": \"Extract metadata from: Clean Code: A Handbook of Agile Software Craftsmanship by Robert C. Martin, 2008\",\n",
"     \"output\": \"title: Clean Code | author: Robert C. Martin | year: 2008\"}\n",
"]\n",
"\n",
"# Convert to DataFrame\n",
"df = pd.DataFrame(data)\n",
"\n",
"# Split Data into Training and Testing (4 examples -> 3 train / 1 test)\n",
"train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)\n",
"\n",
"# Convert DataFrames to HuggingFace Datasets\n",
"train_dataset = Dataset.from_pandas(train_data)\n",
"test_dataset = Dataset.from_pandas(test_data)\n",
"\n",
"# Step 2: Load Pre-Trained T5 Model and Tokenizer\n",
"model_name = \"t5-small\"\n",
"tokenizer = T5Tokenizer.from_pretrained(model_name)\n",
"model = T5ForConditionalGeneration.from_pretrained(model_name)\n",
"\n",
"# Tokenization Function\n",
"def tokenize_data(examples):\n",
"    \"\"\"Tokenize input/output pairs; the output token ids become the labels.\"\"\"\n",
"    inputs = tokenizer(examples[\"input\"], max_length=512, truncation=True, padding=\"max_length\")\n",
"    labels = tokenizer(examples[\"output\"], max_length=512, truncation=True, padding=\"max_length\")\n",
"    inputs[\"labels\"] = labels[\"input_ids\"]\n",
"    return inputs\n",
"\n",
"# Apply Tokenization\n",
"tokenized_train = train_dataset.map(tokenize_data, batched=True)\n",
"tokenized_test = test_dataset.map(tokenize_data, batched=True)\n",
"\n",
"# Step 3: Fine-Tune the Model\n",
"training_args = TrainingArguments(\n",
"    output_dir=\"./results\",\n",
"    eval_strategy=\"epoch\",  # renamed from evaluation_strategy in transformers >= 4.41\n",
"    learning_rate=5e-5,\n",
"    per_device_train_batch_size=4,\n",
"    num_train_epochs=3,\n",
"    weight_decay=0.01,\n",
"    save_total_limit=2,\n",
"    logging_dir=\"./logs\"\n",
")\n",
"\n",
"trainer = Trainer(\n",
"    model=model,\n",
"    args=training_args,\n",
"    train_dataset=tokenized_train,\n",
"    eval_dataset=tokenized_test,\n",
"    tokenizer=tokenizer\n",
")\n",
"\n",
"print(\"Fine-tuning the model...\")\n",
"trainer.train()\n",
"\n",
"# Save Fine-Tuned Model\n",
"model.save_pretrained(\"./fine_tuned_t5_model\")\n",
"tokenizer.save_pretrained(\"./fine_tuned_t5_model\")\n",
"print(\"Fine-tuned model saved to './fine_tuned_t5_model'\")\n",
"\n",
"# Step 4: Cataloging Pipeline with Fine-Tuned Model\n",
"def extract_metadata(text):\n",
"    \"\"\"\n",
"    Use the fine-tuned model to extract metadata from text.\n",
"\n",
"    Returns a dict of field name -> value (e.g. title, author, year).\n",
"    Generated fragments without a ':' separator are skipped instead of\n",
"    raising ValueError -- model output is not guaranteed well-formed.\n",
"    \"\"\"\n",
"    inputs = tokenizer(text, return_tensors=\"pt\", max_length=512, truncation=True)\n",
"    outputs = model.generate(inputs[\"input_ids\"], max_length=128, num_beams=4, early_stopping=True)\n",
"    result = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
"\n",
"    # Parse 'key: value | key: value' output into a structured dict\n",
"    metadata = {}\n",
"    for item in result.split('|'):\n",
"        if ':' not in item:\n",
"            continue  # malformed fragment; don't crash the whole pipeline\n",
"        key, value = item.split(':', 1)\n",
"        metadata[key.strip()] = value.strip()\n",
"    return metadata\n",
"\n",
"def create_marc21(metadata):\n",
"    \"\"\"\n",
"    Generate a MARC21 record from extracted metadata.\n",
"\n",
"    Uses pymarc 5.x Subfield objects (the flat ['a', value, ...] list form\n",
"    was removed in pymarc 5). The blank indicator is ' ' -- '#' is only a\n",
"    display convention in MARC documentation, not a valid stored value.\n",
"    \"\"\"\n",
"    try:\n",
"        record = Record()\n",
"        record.add_field(Field(tag='245', indicators=['1', '0'], subfields=[\n",
"            Subfield(code='a', value=metadata.get('title', '')),\n",
"            Subfield(code='c', value=metadata.get('author', ''))\n",
"        ]))\n",
"        record.add_field(Field(tag='264', indicators=[' ', '1'], subfields=[\n",
"            Subfield(code='b', value=metadata.get('publisher', '')),\n",
"            Subfield(code='c', value=metadata.get('year', ''))\n",
"        ]))\n",
"        return record\n",
"    except Exception as e:\n",
"        print(f\"Error creating MARC21 record: {e}\")\n",
"        return None\n",
"\n",
"# Full Cataloging Pipeline\n",
"def catalog_pipeline():\n",
"    \"\"\"\n",
"    End-to-end cataloging: uploaded text -> metadata -> MARC21 -> MARCXML.\n",
"    \"\"\"\n",
"    print(\"Upload a text file for cataloging:\")\n",
"    uploaded = files.upload()\n",
"    file_name = next(iter(uploaded))\n",
"    with open(file_name, 'r', encoding='utf-8') as file:\n",
"        input_text = file.read()\n",
"\n",
"    # Step 1: Extract Metadata\n",
"    metadata = extract_metadata(input_text)\n",
"    print(\"Extracted Metadata:\", metadata)\n",
"\n",
"    # Step 2: Create MARC21 Record\n",
"    marc_record = create_marc21(metadata)\n",
"    if marc_record:\n",
"        print(\"Generated MARC21 Record:\")\n",
"        print(marc_record)\n",
"\n",
"        # Step 3: Save MARCXML. pymarc's Record has no as_marcxml() method;\n",
"        # record_to_xml() serializes the record and returns bytes.\n",
"        output_file = \"output.marcxml\"\n",
"        with open(output_file, \"wb\") as f:\n",
"            f.write(record_to_xml(marc_record))\n",
"        files.download(output_file)\n",
"        print(\"Pipeline completed. MARCXML file ready for download.\")\n",
"    else:\n",
"        print(\"Failed to generate MARC21 record.\")\n",
"\n",
"# Run the Cataloging Pipeline\n",
"catalog_pipeline()\n"
]}]}
|