"""
Gradio Space for Human-AI Text Attribution (HATA) Model
Detects whether text is human-written or AI-generated
Supports multiple African languages
"""
import gradio as gr
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Load model and tokenizer
MODEL_NAME = "msmaje/phdhatamodel"
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()
print("Model loaded successfully!")

# Language examples (one field per example, matching the single Textbox input
# that gr.Examples is wired to below)
EXAMPLES = [
    ["Ìwé yìí jẹ́ ìwé tó dára púpọ̀ fún àwọn akẹ́kọ̀ọ́."],    # Yoruba
    ["Wannan littafi mai kyau ne ga ɗalibai."],              # Hausa
    ["Akwụkwọ a dị mma maka ụmụ akwụkwọ."],                  # Igbo
    ["Dis book dey very good for students wey wan learn."],  # Nigerian Pidgin
]

def classify_text(text, show_probabilities=True):
    """
    Classify text as human-written or AI-generated.

    Args:
        text: Input text to classify.
        show_probabilities: Whether to return probability scores for the bar plot.

    Returns:
        A markdown result string and, optionally, a DataFrame of class probabilities.
    """
    if not text or len(text.strip()) == 0:
        return "⚠️ Please enter some text to classify.", None

    # Tokenize
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding=True
    )
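    # max_length=128 matches the model's maximum sequence length (see the
    # About tab below); anything longer is truncated before classification.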

    # Get prediction
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0][predicted_class].item()

    # Labels (index 0 = human, index 1 = AI)
    labels = {0: "👤 Human-written", 1: "🤖 AI-generated"}

    # Create result text
    result = f"## Prediction: {labels[predicted_class]}\n"
    result += f"**Confidence:** {confidence:.2%}\n\n"

    # Add interpretation
    if confidence > 0.9:
        result += "✅ **High confidence** - The model is very certain about this prediction."
    elif confidence > 0.7:
        result += "⚠️ **Moderate confidence** - The model is fairly certain, but there is some uncertainty."
    else:
        result += "❓ **Low confidence** - The model is uncertain. The text may have mixed characteristics."

    # Probability data as a DataFrame, the shape gr.BarPlot expects
    prob_data = pd.DataFrame({
        "label": ["Human-written", "AI-generated"],
        "probability": [float(probabilities[0][0]), float(probabilities[0][1])],
    })

    if show_probabilities:
        return result, prob_data
    return result, None
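
# Quick sanity check for classify_text (hypothetical output; the actual label
# depends on the model):
#   >>> result, probs = classify_text("This is a short test sentence.")
#   >>> result.splitlines()[0]
#   '## Prediction: 👤 Human-written'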

def batch_classify(file):
    """
    Classify multiple texts from an uploaded file, one sample per line.
    """
    if file is None:
        return "⚠️ Please upload a text file."

    # gr.File may return a filepath string or a tempfile-like object with a
    # .name attribute, depending on the Gradio version; handle both.
    path = file if isinstance(file, str) else file.name

    # Read file
    try:
        with open(path, 'r', encoding='utf-8') as f:
            texts = f.readlines()
    except Exception as e:
        return f"❌ Error reading file: {e}"

    # Process each non-empty line
    results = []
    for i, text in enumerate(texts, 1):
        text = text.strip()
        if not text:
            continue
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0][predicted_class].item()
        label = "Human" if predicted_class == 0 else "AI"
        # Truncate long lines in the report; add "..." only when truncated
        snippet = text[:100] + ("..." if len(text) > 100 else "")
        results.append(f"{i}. [{label} - {confidence:.2%}] {snippet}")

    return "\n".join(results)

# Custom CSS
custom_css = """
#title {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5em;
    font-weight: bold;
    margin-bottom: 0.5em;
}
#subtitle {
    text-align: center;
    color: #666;
    font-size: 1.2em;
    margin-bottom: 1em;
}
.output-box {
    border: 2px solid #667eea;
    border-radius: 10px;
    padding: 15px;
}
.gradio-container {
    max-width: 900px;
    margin: auto;
}
"""

# Create Gradio interface
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    # Header
    gr.Markdown("<h1 id='title'>🔍 Human vs AI Text Detector</h1>")
    gr.Markdown(
        "<p id='subtitle'>Detect whether text is human-written or AI-generated | "
        "Supports African Languages 🌍</p>"
    )

    # Main interface
    with gr.Tabs():
        # Tab 1: Single text classification
        with gr.Tab("📝 Single Text"):
            with gr.Row():
                with gr.Column(scale=2):
                    text_input = gr.Textbox(
                        label="Enter text to classify",
                        placeholder="Type or paste your text here...",
                        lines=6,
                        max_lines=10
                    )
                    show_probs = gr.Checkbox(
                        label="Show probability distribution",
                        value=True
                    )
                    with gr.Row():
                        classify_btn = gr.Button("🔍 Classify Text", variant="primary")
                        clear_btn = gr.ClearButton([text_input])
                with gr.Column(scale=2):
                    result_output = gr.Markdown(label="Result")
                    prob_plot = gr.BarPlot(
                        x="label",
                        y="probability",
                        title="Probability Distribution",
                        y_lim=[0, 1],
                        height=300,
                        visible=True
                    )
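                    # gr.BarPlot expects a DataFrame whose columns match the
                    # x= and y= names above; classify_text returns exactly that
                    # shape ("label", "probability").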

            # Examples
            gr.Markdown("### 📚 Try these examples:")
            gr.Examples(
                examples=EXAMPLES,
                inputs=[text_input],
                label="Example texts in different languages"
            )

            # Connect classification function
            classify_btn.click(
                fn=classify_text,
                inputs=[text_input, show_probs],
                outputs=[result_output, prob_plot]
            )

        # Tab 2: Batch classification
        with gr.Tab("📄 Batch Processing"):
            gr.Markdown("""
            ### Upload a text file for batch classification
            Upload a `.txt` file with one text sample per line.
            The app will classify each line and show the results.
            """)
            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="Upload text file (.txt)",
                        file_types=[".txt"]
                    )
                    batch_btn = gr.Button("🔍 Classify All", variant="primary")
                with gr.Column():
                    batch_output = gr.Textbox(
                        label="Batch Results",
                        lines=15,
                        max_lines=20
                    )
            batch_btn.click(
                fn=batch_classify,
                inputs=file_input,
                outputs=batch_output
            )

        # Tab 3: About
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            # About This Model

            ## 🎯 Purpose
            This model detects whether text is **human-written** or **AI-generated**.
            It has been specifically trained on African languages to ensure fair and
            accurate detection across diverse linguistic contexts.

            ## 🌍 Supported Languages
            - **English**
            - **Yoruba** (yo)
            - **Hausa** (ha)
            - **Igbo** (ig)
            - **Swahili** (sw)
            - **Amharic** (am)
            - **Nigerian Pidgin** (pcm)

            ## 📊 Performance
            - **Accuracy:** 100%
            - **F1 Score:** 100%
            - **Fairness Metrics:** EOD = 0.0, AAOD = 0.0 (perfect fairness)

            ## 🔬 Model Details
            - **Base Model:** [AfroXLMR-base](https://huggingface.co/davlan/afro-xlmr-base)
            - **Parameters:** ~270M (0.3B)
            - **Max Sequence Length:** 128 tokens
            - **Training Dataset:** PhD HATA African Dataset
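
            ## 🚀 Quick Start (programmatic use)
            Outside this Space, loading the checkpoint with 🤗 Transformers should look
            roughly like this (a minimal sketch mirroring the preprocessing this app
            uses; the label order, 0 = human and 1 = AI, is assumed from this app):

            ```python
            from transformers import AutoTokenizer, AutoModelForSequenceClassification
            import torch

            tokenizer = AutoTokenizer.from_pretrained("msmaje/phdhatamodel")
            model = AutoModelForSequenceClassification.from_pretrained("msmaje/phdhatamodel")
            model.eval()

            inputs = tokenizer("Your text here", return_tensors="pt",
                               truncation=True, max_length=128)
            with torch.no_grad():
                probs = model(**inputs).logits.softmax(dim=-1)[0]
            print({"human": probs[0].item(), "ai": probs[1].item()})
            ```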

            ## ⚖️ Fairness & Ethics
            This model has been trained with explicit fairness constraints to ensure:
            - Equal performance across all supported languages
            - No bias toward high-resource languages
            - Fair treatment of diverse linguistic communities

            ## ⚠️ Limitations
            - Performance may vary on languages outside the training distribution
            - AI detection capabilities are tied to the AI systems present in the training data
            - It should be used as one component of content verification, not as the sole determinant
            - Text length and domain may affect accuracy

            ## 📚 Citation
            ```bibtex
            @misc{msmaje2025hata,
                author    = {Maje, M.S.},
                title     = {AfroXLMR for Human-AI Text Attribution},
                year      = {2025},
                publisher = {HuggingFace},
                url       = {https://huggingface.co/msmaje/phdhatamodel}
            }
            ```

            ## 🔗 Links
            - [Model on HuggingFace](https://huggingface.co/msmaje/phdhatamodel)
            - [Training Visualizations](https://huggingface.co/msmaje/phdhatamodel/tree/main/visualizations)
            - [Dataset](https://huggingface.co/datasets/msmaje/phd-hata-african-dataset)

            ## 👤 Contact
            For questions or feedback, please open an issue on the model repository.
            """)

    # Footer
    gr.Markdown("""
    ---
    <div style='text-align: center; color: #666; padding: 20px;'>
        <p>Built with 💜 for African Language NLP | Powered by AfroXLMR</p>
        <p>Model: <a href='https://huggingface.co/msmaje/phdhatamodel'>msmaje/phdhatamodel</a></p>
    </div>
    """)
# Launch
if __name__ == "__main__":
    demo.launch()