# French-TV-Headline-Classification / inference_classification_transcript.py
# Author: Pclanglais — "Create inference_classification_transcript.py"
# (commit 1b0843f, verified)
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm.auto import tqdm
# Configuration constants.
batch_size = 1000
model_checkpoint = "PleIAs/French-TV-Headline-Classification"


def _flatten_classifications(classifications, identifiers, start_index):
    """Flatten per-text label/score lists into flat row dicts.

    Parameters
    ----------
    classifications : list[list[dict]]
        One list of ``{"label", "score"}`` dicts per input text (the
        pipeline is called with ``top_k=None``, so every label is scored
        for every text).
    identifiers : sequence
        The batch's ``identifier`` column values, aligned with
        ``classifications``.
    start_index : int
        Global offset of this batch, used to build a stable ``text_id``.

    Returns
    -------
    list[dict]
        One row per (text, label) pair; score is a percentage rounded
        to 2 decimals.
    """
    rows = []
    for text_index, class_results in enumerate(classifications):
        identifier = identifiers[text_index]
        for entry in class_results:
            rows.append({
                'text_id': start_index + text_index,
                'label': entry['label'],
                'score': round(entry['score'] * 100, 2),
                'identifier': identifier,
            })
    return rows


def classify_dataframe(frame, text_pipeline, batch_size=batch_size,
                       show_progress=True):
    """Classify every row of *frame* in batches.

    *frame* must carry ``corrected_text`` and ``identifier`` columns.
    Returns a long-format DataFrame with columns
    ``text_id`` / ``label`` / ``score`` / ``identifier`` — one row per
    (text, label) pair.
    """
    batch_frames = []
    starts = range(0, len(frame), batch_size)
    if show_progress:
        starts = tqdm(starts, desc="Processing batches")
    for start_index in starts:
        batch = frame.iloc[start_index:start_index + batch_size]
        texts = batch["corrected_text"].tolist()
        # top_k=None -> return scores for every label, not just the argmax.
        classifications = text_pipeline(
            texts, truncation=True, padding=True, top_k=None
        )
        rows = _flatten_classifications(
            classifications, batch["identifier"].tolist(), start_index
        )
        batch_frames.append(pd.DataFrame(rows))
    # Fix: the original crashed on an empty input dataset
    # (pd.concat raises ValueError on an empty list).
    if not batch_frames:
        return pd.DataFrame(columns=["text_id", "label", "score", "identifier"])
    return pd.concat(batch_frames, ignore_index=True)


def main():
    """Load model and data, run classification, write the CSV report."""
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint, model_max_length=512
    )
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
    classification_pipeline = pipeline(
        "text-classification", model=model, tokenizer=tokenizer
    )
    # NOTE(review): "[file]" is a placeholder left in the published script —
    # point it at the real parquet path before running.
    val_classification = pd.read_parquet("[file]")
    val_classification.reset_index(drop=True, inplace=True)
    final_df = classify_dataframe(val_classification, classification_pipeline)
    print(final_df)
    final_df.to_csv("transcript_classification.csv", index=False)


if __name__ == "__main__":
    main()