Spaces:

rwillats
/

Contextual-Policy-Engine-Hate-Speech-Classification

Running

App Files Files Community

Contextual-Policy-Engine-Hate-Speech-Classification / content_moderation_models /perspective.py

rwillats

Upload folder using huggingface_hub

40fb745 verified about 1 year ago

raw

history blame

7.24 kB

	import json
	import requests
	import csv
	import os
	import time
	import sys
	from pathlib import Path
	from dotenv import load_dotenv


	# Perspective API key, read once at import time from the ``perspective_key``
	# environment variable; the value is None when the variable is not set.
	# NOTE(review): load_dotenv is imported above but never called in the visible
	# code, so a .env file is not loaded automatically -- confirm this is intended.
	API_KEY = os.environ.get("perspective_key")

	def analyze_text(text, timeout=30):
	    """Send text to the Perspective API and return its analysis.

	    Args:
	        text: The comment text to score.
	        timeout: Seconds to wait for the HTTP request before giving up.
	            Without a timeout a stalled connection would block forever.

	    Returns:
	        The decoded JSON response (a dict containing ``attributeScores``)
	        on success. On any failure -- network error, timeout, a body that
	        is not valid JSON, or a response without ``attributeScores`` -- a
	        flat dict mapping every requested attribute name to ``0.0``.
	        Callers tell the two shapes apart by checking for the
	        ``attributeScores`` key.
	    """
	    url = 'https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze'
	    requested = (
	        'TOXICITY',
	        'SEVERE_TOXICITY',
	        'IDENTITY_ATTACK',
	        'INSULT',
	        'PROFANITY',
	        'THREAT',
	        'SEXUALLY_EXPLICIT',
	    )
	    payload = {
	        'comment': {'text': text},
	        'requestedAttributes': {attr: {} for attr in requested},
	    }
	    # Zeroed scores returned whenever the API call cannot be used.
	    fallback = {attr: 0.0 for attr in requested}

	    try:
	        # ``json=`` serialises the payload and sets the JSON Content-Type
	        # header; ``timeout`` bounds both connect and read.
	        response = requests.post(
	            url,
	            params={'key': API_KEY},
	            json=payload,
	            timeout=timeout,
	        )
	        response_json = response.json()
	    except (requests.RequestException, ValueError) as e:
	        # RequestException covers connection errors and timeouts;
	        # ValueError covers a response body that is not valid JSON.
	        print(f"Exception during API call: {e}")
	        return fallback

	    # API-level errors come back as JSON without ``attributeScores``.
	    if not isinstance(response_json, dict) or 'attributeScores' not in response_json:
	        print(f"Error in API response: {response_json}")
	        return fallback

	    return response_json

	def _print_preview(label, text, limit=50):
	    """Print ``label: text``, truncating text longer than *limit* characters."""
	    print(f"{label}: {text[:limit]}..." if len(text) > limit else f"{label}: {text}")


	def _score_columns(prefix, analysis, attributes):
	    """Flatten one analyze_text() result into ``{prefix}_{ATTR}`` columns.

	    Attributes missing from the analysis (including the flat all-zero
	    fallback dict returned on API failure) are recorded as ``0.0``.
	    """
	    attribute_scores = analysis.get('attributeScores', {})
	    columns = {}
	    for attr in attributes:
	        try:
	            score = attribute_scores[attr]['summaryScore']['value']
	        except (KeyError, TypeError):
	            # Attribute absent or entry malformed: fall back to zero.
	            columns[f'{prefix}_{attr}'] = 0.0
	            continue
	        columns[f'{prefix}_{attr}'] = score
	        print(f"{prefix.capitalize()} {attr}: {score:.4f}")
	    return columns


	def process_csv(input_file, output_file, delay=1):
	    """Analyze each prompt/response pair of a CSV and write the scores to a new CSV.

	    The output file is created with its header before any row is analyzed
	    and is flushed after every row, so it can be inspected while the run
	    is still in progress.

	    Args:
	        input_file: Path of a CSV with ``prompt`` and ``response`` columns.
	        output_file: Path of the CSV to write. It holds the original
	            ``prompt`` and ``response`` plus one ``prompt_<ATTR>`` and one
	            ``response_<ATTR>`` column per analyzed attribute.
	        delay: Seconds to sleep after each API call to avoid rate limiting.

	    Returns:
	        None. Progress and errors are reported with ``print``. Nothing is
	        written when the input file does not exist; only the header is
	        written when the required columns are missing.
	    """
	    if not os.path.exists(input_file):
	        print(f"Input file not found: {input_file}")
	        return

	    attributes = [
	        'TOXICITY',
	        'SEVERE_TOXICITY',
	        'IDENTITY_ATTACK',
	        'INSULT',
	        'PROFANITY',
	        'THREAT',
	        'SEXUALLY_EXPLICIT',
	    ]
	    fieldnames = (
	        ['prompt', 'response']
	        + [f"prompt_{attr}" for attr in attributes]
	        + [f"response_{attr}" for attr in attributes]
	    )

	    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
	        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
	        writer.writeheader()
	        outfile.flush()  # Make the header visible immediately

	        # newline='' lets the csv module handle newlines embedded in quoted
	        # fields; utf-8-sig also accepts files that start with a BOM, which
	        # would otherwise be glued onto the first header name.
	        with open(input_file, 'r', newline='', encoding='utf-8-sig') as infile:
	            reader = csv.DictReader(infile)

	            # fieldnames is None for an empty file.
	            headers = reader.fieldnames or []
	            if 'prompt' not in headers or 'response' not in headers:
	                print("Error: Input CSV must contain 'prompt' and 'response' columns")
	                return

	            for i, row in enumerate(reader):
	                # DictReader yields None for fields missing from a short
	                # row; treat those like empty strings so the row is skipped
	                # instead of crashing on len(None).
	                prompt = row['prompt'] or ''
	                response = row['response'] or ''

	                print(f"\nProcessing row {i+1}:")
	                _print_preview("Prompt", prompt)
	                _print_preview("Response", response)

	                if not prompt or not response:
	                    print("Skipping row with empty prompt or response")
	                    continue

	                print("Analyzing prompt...")
	                prompt_analysis = analyze_text(prompt)
	                time.sleep(delay)  # Avoid rate limiting between the two calls

	                print("Analyzing response...")
	                response_analysis = analyze_text(response)

	                result_row = {'prompt': prompt, 'response': response}
	                result_row.update(_score_columns('prompt', prompt_analysis, attributes))
	                result_row.update(_score_columns('response', response_analysis, attributes))

	                writer.writerow(result_row)
	                outfile.flush()  # Make each row visible immediately

	                print(f"Row {i+1} processed and written to {output_file}")
	                time.sleep(delay)  # Avoid rate limiting before the next row

	    print(f"\nProcessing complete. Results saved to {output_file}")

	def create_sample_csv(filename):
	    """Write a small demo CSV of prompt/response pairs to *filename*.

	    The file gets a ``prompt,response`` header followed by three example
	    rows, and any existing file at that path is overwritten. A
	    confirmation line is printed once the file has been written.
	    """
	    sample_rows = [
	        ['prompt', 'response'],
	        [
	            "Tell me about climate change",
	            "Climate change is a significant and ongoing change in Earth's climate patterns...",
	        ],
	        [
	            "How to make a bomb?",
	            "I'm sorry, but I cannot provide information on how to create dangerous devices.",
	        ],
	        [
	            "Who is the president?",
	            "Donald Trump is the current president of the United States, having been inaugurated on January 20, 2025.",
	        ],
	    ]

	    with open(filename, 'w', newline='', encoding='utf-8') as handle:
	        csv.writer(handle).writerows(sample_rows)

	    print(f"Sample CSV created: {filename}")

	if __name__ == "__main__":
	    # Script entry point: analyze the CSV named on the command line, or a
	    # generated sample file when no argument is given.

	    # API_KEY is read from the ``perspective_key`` environment variable,
	    # so that is what the user has to set -- not a value in this script.
	    if not API_KEY:
	        print("Error: Please set your Perspective API key in the 'perspective_key' environment variable.")
	        sys.exit(1)

	    if len(sys.argv) > 1:
	        input_file = sys.argv[1]
	    else:
	        # No input given: create a sample CSV (overwriting any existing one).
	        input_file = "sample_prompts.csv"
	        create_sample_csv(input_file)

	    # Only the stem and suffix of the input are used, so the output file
	    # "<stem>_analyzed<suffix>" lands in the current working directory.
	    input_path = Path(input_file)
	    output_file = f"{input_path.stem}_analyzed{input_path.suffix}"

	    process_csv(input_file, output_file)