Spaces:
Sleeping
Sleeping
| import spacy | |
| from spacy.training import Example | |
| import jsonlines | |
| import random | |
# Start from a blank English pipeline.
nlp = spacy.blank("en")

# Append a multi-label text classifier to the end of the pipeline and register
# each category it should predict, in a fixed order.
textcat = nlp.add_pipe("textcat_multilabel", last=True)
for category in (
    "CapitalRequirements",
    "ConsumerProtection",
    "RiskManagement",
    "ReportingAndCompliance",
    "CorporateGovernance",
):
    textcat.add_label(category)
# Location of the preprocessed training records (JSON Lines file).
processed_data_file = "data/firstStep_file.jsonl"

# Read every record into memory. Each record is accessed below through its
# "text" and "label" keys.
with jsonlines.open(processed_data_file) as reader:
    processed_data = list(reader)

# Build one spaCy Example per record. The record's single label string is
# expanded into a {category: bool} mapping: the matching category is True and
# all others are False (every entry is False if the label matches none).
# NOTE(review): one label per record looks like a mutually exclusive task;
# the pipeline uses "textcat_multilabel" -- confirm that is intended.
category_names = (
    "CapitalRequirements",
    "ConsumerProtection",
    "RiskManagement",
    "ReportingAndCompliance",
    "CorporateGovernance",
)
spacy_train_data = [
    Example.from_dict(
        nlp.make_doc(record["text"]),
        {"cats": {name: record["label"] == name for name in category_names}},
    )
    for record in processed_data
]
# Seed the random number generators once, *before* initialization, so the
# whole run (initialization, shuffling, training) is reproducible.
# Re-seeding inside the epoch loop would reset the generators to the same
# state at the start of every epoch, so each epoch would replay the same
# random sequence.
spacy.util.fix_random_seed(1)

# Initialize the pipeline and get the optimizer. Passing the training examples
# lets the components initialize from real data instead of spaCy's built-in
# dummy sample. With an empty data set, fall back to the argument-less call so
# the script still runs to completion as before.
get_examples = (lambda: spacy_train_data) if spacy_train_data else None
optimizer = nlp.initialize(get_examples)

# Train the text classification model.
n_iter = 10
for i in range(n_iter):
    # New example order each epoch; reproducible thanks to the single seed.
    random.shuffle(spacy_train_data)
    losses = {}  # filled in by nlp.update over this epoch's batches
    for batch in spacy.util.minibatch(spacy_train_data, size=8):
        nlp.update(batch, losses=losses, sgd=optimizer)
    # Report once per epoch, after all batches have been processed.
    print("Iteration:", i, "Losses:", losses)

# Save the trained pipeline to disk.
output_dir = "./my_trained_model"
nlp.to_disk(output_dir)