Spaces:

Rathapoom
/

thai-ner-demo

Runtime error

App Files Files Community

thai-ner-demo / app.py

Rathapoom

Update app.py

f5a7ece verified 7 months ago

raw

history blame contribute delete

5.36 kB

	import gradio as gr
	import pandas as pd
	from transformers import pipeline
	import re
	import os

	# 1. Load the NER model once at import time; the functions below reuse it.
	print("กำลังโหลดโมเดล...")
	# Access token for the model hub, read from the environment (None if unset).
	hf_token = os.environ.get("HF_TOKEN")
	ner_pipeline = pipeline(
	    task="token-classification",
	    model="loolootech/no-name-ner-th",
	    token=hf_token,
	    device=-1,  # -1 is the Transformers convention for running on CPU
	)
	print("โมเดลพร้อมใช้งานแล้ว")


	# 2. Merge token-level NER predictions into entity spans.
	def merge_entities(ner_results):
	    """Join adjacent tokens of the same entity type into single spans.

	    A token is folded into the previous span when its ``start`` equals that
	    span's ``end`` and both share the same type once a leading ``B-``/``I-``
	    prefix has been stripped. Otherwise the token opens a new span.

	    Args:
	        ner_results: Iterable of dicts with the keys ``entity``, ``word``,
	            ``start``, ``end`` and ``score``. The dicts are not modified.

	    Returns:
	        A list of new dicts with the keys ``type``, ``word``, ``start``,
	        ``end`` and ``score``. ``word`` is the concatenation of the merged
	        token words and ``score`` is the highest score among them.
	    """
	    spans = []
	    for token in ner_results:
	        label = re.sub(r'^[BI]-', '', token['entity'])
	        previous = spans[-1] if spans else None
	        continues_previous = (
	            previous is not None
	            and token['start'] == previous['end']
	            and label == previous['type']
	        )
	        if not continues_previous:
	            # Start a fresh span; it may still grow on later iterations.
	            spans.append({
	                'type': label,
	                'word': token['word'],
	                'start': token['start'],
	                'end': token['end'],
	                'score': token['score'],
	            })
	            continue
	        # Extend the most recent span in place.
	        previous['word'] += token['word']
	        previous['end'] = token['end']
	        previous['score'] = max(previous['score'], token['score'])
	    return spans


	# 3. De-identify a single piece of text (one cell / one line).
	def deidentify_single_text(text):
	    """Replace every entity found in ``text`` with a ``[TYPE]`` placeholder.

	    Args:
	        text: Value to redact. Anything that is not a non-blank string
	            (``None``, NaN, numbers, lists, ``""``, whitespace only) is
	            treated as "nothing to redact".

	    Returns:
	        ``""`` for non-string or blank input. Otherwise ``text`` with each
	        merged entity span replaced by ``[<entity type>]``.
	    """
	    # The type check alone covers None/NaN/NA. Unlike ``pd.isna`` it never
	    # returns an array, so list-like input cannot raise an "ambiguous truth
	    # value" error here.
	    if not isinstance(text, str) or not text.strip():
	        return ""

	    merged = merge_entities(ner_pipeline(text))

	    redacted_text = text
	    # Work right-to-left so that replacing one span does not shift the
	    # character offsets of the spans that are still to be replaced.
	    for entity in reversed(merged):
	        start, end, label = entity['start'], entity['end'], entity['type']
	        redacted_text = redacted_text[:start] + f"[{label}]" + redacted_text[end:]

	    return redacted_text


	# 4. [Updated] Process a whole uploaded file (no column name is required).
	def process_entire_file(uploaded_file, progress=gr.Progress(track_tqdm=True)):
	    """De-identify every text column of an uploaded CSV or Excel file.

	    Args:
	        uploaded_file: The uploaded file: either a path (``str`` or
	            ``os.PathLike``) or an object exposing the path as ``.name``.
	            ``None`` means nothing was uploaded.
	        progress: Gradio progress tracker; its ``tqdm`` wrapper is used to
	            report progress column by column.

	    Returns:
	        A ``(df_redacted, output_filepath)`` tuple: the redacted DataFrame
	        and the path of the CSV file written from it.

	    Raises:
	        gr.Error: If no file was given, the extension is not
	            ``.csv``/``.xlsx``/``.xls``, the file cannot be read, or the
	            table has no ``object``-dtype column.
	    """
	    if uploaded_file is None:
	        raise gr.Error("กรุณาอัปโหลดไฟล์ก่อน")

	    # Plain paths are used as-is; other objects must expose ``.name``.
	    if isinstance(uploaded_file, (str, os.PathLike)):
	        file_path = os.fspath(uploaded_file)
	    else:
	        file_path = uploaded_file.name

	    # Pick the reader before entering the ``try`` so the "unsupported file"
	    # error is not caught below and re-wrapped as a read failure. The check
	    # is case-insensitive so that e.g. ``DATA.CSV`` is accepted as well.
	    lowered_path = file_path.lower()
	    if lowered_path.endswith('.csv'):
	        read_table = pd.read_csv
	    elif lowered_path.endswith(('.xlsx', '.xls')):
	        read_table = pd.read_excel
	    else:
	        raise gr.Error("ไฟล์ไม่รองรับ กรุณาอัปโหลด .csv หรือ .xlsx เท่านั้น")

	    # Read the file with pandas; any failure is surfaced to the UI.
	    try:
	        df = read_table(file_path)
	    except Exception as e:
	        raise gr.Error(f"ไม่สามารถอ่านไฟล์ได้: {e}") from e

	    # Work on a copy so the loaded table itself stays untouched.
	    df_redacted = df.copy()

	    # [Key change] Pick every column whose dtype is ``object`` (text).
	    # NOTE(review): confirm whether pandas' dedicated string dtype should be
	    # selected here as well.
	    text_columns = df.select_dtypes(include=['object']).columns

	    if text_columns.empty:
	        raise gr.Error("ไม่พบคอลัมน์ที่เป็นข้อมูลประเภทข้อความ (text) ในไฟล์นี้เลย")

	    def redact_cell(value):
	        """Redact one cell; missing cells become an empty string."""
	        if isinstance(value, str):
	            return deidentify_single_text(value)
	        # Casting the whole column with ``astype(str)`` would turn missing
	        # cells into the literal text "nan" and send that to the model.
	        if pd.api.types.is_scalar(value) and pd.isna(value):
	            return ""
	        return deidentify_single_text(str(value))

	    # Loop over and process every text column that was found.
	    print(f"กำลังประมวลผลคอลัมน์: {list(text_columns)}")
	    for col_name in progress.tqdm(text_columns, desc="Processing text columns"):
	        df_redacted[col_name] = df[col_name].apply(redact_cell)

	    # Write the result file offered to the user for download. The
	    # ``utf-8-sig`` encoding adds a byte-order mark to the CSV.
	    output_filepath = "processed_output_full.csv"
	    df_redacted.to_csv(output_filepath, index=False, encoding='utf-8-sig')

	    return df_redacted, output_filepath


	# 5. [Updated] Build the Gradio page (the column-name input was removed).
	iface = gr.Interface(
	    fn=process_entire_file,
	    inputs=[
	        gr.File(
	            label="อัปโหลดไฟล์ CSV หรือ Excel ที่ต้องการตรวจสอบทั้งตาราง",
	            file_types=[".csv", ".xlsx", ".xls"],
	        ),
	    ],
	    outputs=[
	        # ``max_rows`` is intentionally not passed: current Gradio releases
	        # no longer accept it on the Dataframe component and fail at
	        # start-up with a TypeError when it is given.
	        gr.DataFrame(label="ตารางผลลัพธ์ (Output Table Preview)", wrap=True),
	        gr.File(label="ดาวน์โหลดผลลัพธ์ (Download Result as CSV)"),
	    ],
	    title="📁 Automatic Table De-identification",
	    description="อัปโหลดไฟล์ตาราง (CSV, Excel) แล้วระบบจะค้นหาคอลัมน์ที่เป็น 'ข้อความ' ทั้งหมดโดยอัตโนมัติ และทำการปกปิดข้อมูลส่วนบุคคลให้ทันที",
	    # NOTE(review): newer Gradio versions name this option ``flagging_mode``;
	    # confirm against the version pinned for this Space.
	    allow_flagging="never",
	)

	# Run the app.
	iface.launch()