Spaces:

JangTaeng
/

BERT

Sleeping

App Files Files Community

BERT / app.py

JangTaeng

Upload 7 files

05c8e16 verified 11 days ago

raw

history blame contribute delete

21.1 kB

	"""
	BERT 인터랙티브 데모 (한글 버전)
	================================
	Devlin et al. (2019), "BERT: Pre-training of Deep Bidirectional Transformers
	for Language Understanding" (arXiv:1810.04805) 논문의 태스크들을 재현합니다.

	데모는 논문의 구성을 따라 여섯 개 탭으로 나뉘어 있습니다:

	사전학습 태스크 (논문 §3.1)
	1. Masked Language Model -- 핵심 양방향 목적함수
	2. Next Sentence Prediction -- IsNext / NotNext 이진 태스크

	파인튜닝 태스크 (논문 §4, Figure 4)
	3. 문장 쌍 분류 -- MNLI / RTE / MRPC 계열 (Figure 4a)
	4. 단일 문장 분류 -- SST-2 / CoLA 계열 (Figure 4b)
	5. 질의응답 -- SQuAD v1.1 계열 (Figure 4c)
	6. 개체명 인식 -- CoNLL-2003 계열 (Figure 4d)

	로컬 실행:
	pip install -r requirements.txt
	python app.py
	"""

	from __future__ import annotations

	import gradio as gr
	import torch
	from transformers import (
	AutoTokenizer,
	AutoModelForNextSentencePrediction,
	pipeline,
	)

	# Device selector: 0 when CUDA is available, otherwise -1. It is passed as
	# `device=` to every pipeline() call, and a value >= 0 also makes the NSP
	# code path move its model and input tensors with `.cuda()`.
	DEVICE = 0 if torch.cuda.is_available() else -1


	# ---------------------------------------------------------------------------
	# Model loading
	# ---------------------------------------------------------------------------
	# Models are lazy-loaded on first use to keep cold-start time down.
	# get_pipeline() stores every object it builds in this module-level dict,
	# keyed by the short task name.

	_pipelines: dict[str, object] = {}


	def get_pipeline(name: str):
	    """Return the model object registered under *name*, building it on first use.

	    Built objects are stored in the module-level ``_pipelines`` dict, so a
	    later call with the same name returns the stored object.

	    Args:
	        name: One of ``"mlm"``, ``"nsp"``, ``"mnli"``, ``"sst2"``, ``"squad"``
	            or ``"ner"``.

	    Returns:
	        The result of ``pipeline(...)`` for every name except ``"nsp"``, which
	        yields a ``(tokenizer, model)`` tuple.

	    Raises:
	        ValueError: If *name* is not one of the known keys.
	    """
	    try:
	        return _pipelines[name]
	    except KeyError:
	        pass  # not built yet; fall through and construct it

	    # Task name and extra keyword arguments for everything built via pipeline().
	    pipeline_specs = {
	        # §3.1 Task #1: predict masked tokens from bidirectional context.
	        "mlm": ("fill-mask", {"model": "bert-base-uncased", "top_k": 5}),
	        # §4.1: MNLI is the largest GLUE task (392k examples).
	        "mnli": (
	            "text-classification",
	            {"model": "textattack/bert-base-uncased-MNLI"},
	        ),
	        # §4.1: SST-2 is a binary sentiment classification task.
	        "sst2": (
	            "text-classification",
	            {"model": "textattack/bert-base-uncased-SST-2"},
	        ),
	        # §4.2: SQuAD v1.1 - extractive QA, start/end span prediction.
	        "squad": (
	            "question-answering",
	            {"model": "bert-large-uncased-whole-word-masking-finetuned-squad"},
	        ),
	        # §5.3: CoNLL-2003 NER - token-level tagging.
	        "ner": (
	            "token-classification",
	            {"model": "dslim/bert-base-NER", "aggregation_strategy": "simple"},
	        ),
	    }

	    if name == "nsp":
	        # §3.1 Task #2: the NSP head ships with bert-base-uncased.
	        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
	        model = AutoModelForNextSentencePrediction.from_pretrained("bert-base-uncased")
	        model.eval()
	        if DEVICE >= 0:
	            model = model.cuda()
	        built = (tokenizer, model)
	    elif name in pipeline_specs:
	        task, options = pipeline_specs[name]
	        built = pipeline(task, device=DEVICE, **options)
	    else:
	        raise ValueError(f"알 수 없는 파이프라인입니다: {name}")

	    _pipelines[name] = built
	    return built


	# ---------------------------------------------------------------------------
	# 1. Masked Language Model (논문 §3.1, Task #1)
	# ---------------------------------------------------------------------------

	def run_mlm(text: str) -> str:
	    """Predict the token(s) behind ``[MASK]`` and format them as Markdown.

	    Args:
	        text: Input sentence containing one or more ``[MASK]`` placeholders.

	    Returns:
	        A Markdown string listing the ranked candidates for every mask, or a
	        usage hint when *text* is empty or contains no ``[MASK]``.
	    """
	    if not text or "[MASK]" not in text:
	        return "❗ 문장에 `[MASK]` 토큰을 하나 이상 포함시켜 주세요."

	    nlp = get_pipeline("mlm")
	    predictions = nlp(text)

	    # With one mask the result is a flat list of candidate dicts; with several
	    # masks it is a list of such lists. Normalise to "one group per mask" so
	    # no mask is silently dropped.
	    if predictions and isinstance(predictions[0], list):
	        mask_groups = predictions
	    else:
	        mask_groups = [predictions]
	    several_masks = len(mask_groups) > 1

	    lines = [f"입력 문장: `{text}`", ""]
	    for mask_no, candidates in enumerate(mask_groups, 1):
	        if several_masks:
	            if mask_no > 1:
	                lines.append("")  # blank line between groups for Markdown
	            lines.extend([f"[MASK] #{mask_no}", ""])
	        # Report the number of candidates actually returned instead of a
	        # hard-coded 5.
	        lines.extend([f"상위 {len(candidates)}개 예측 (softmax 확률 기준):", ""])
	        for rank, pred in enumerate(candidates, 1):
	            lines.append(f"{rank}. {pred['token_str']} — 확률 `{pred['score']:.4f}`")
	            lines.append(f" → {pred['sequence']}")
	    return "\n".join(lines)


	# ---------------------------------------------------------------------------
	# 2. Next Sentence Prediction (논문 §3.1, Task #2)
	# ---------------------------------------------------------------------------

	def run_nsp(sentence_a: str, sentence_b: str) -> str:
	    """Score whether sentence B follows sentence A and report it as Markdown.

	    Args:
	        sentence_a: First sentence.
	        sentence_b: Candidate next sentence.

	    Returns:
	        A Markdown string with the verdict and both class probabilities, or a
	        usage hint when either sentence is blank.
	    """
	    if not (sentence_a.strip() and sentence_b.strip()):
	        return "❗ 문장 A와 문장 B를 모두 입력해 주세요."

	    tokenizer, model = get_pipeline("nsp")
	    # Passing both sentences lets the tokenizer build the sentence-pair input
	    # ([CLS] A [SEP] B [SEP] plus segment ids), matching Figure 2 of the paper.
	    encoded = tokenizer(sentence_a, sentence_b, return_tensors="pt", truncation=True)
	    if DEVICE >= 0:
	        encoded = {key: tensor.cuda() for key, tensor in encoded.items()}

	    with torch.no_grad():
	        logits = model(**encoded).logits

	    # Hugging Face BERT NSP head: label 0 = IsNext, label 1 = NotNext.
	    class_probs = torch.softmax(logits, dim=-1).squeeze().tolist()
	    is_next_prob = class_probs[0]
	    not_next_prob = class_probs[1]
	    if is_next_prob > not_next_prob:
	        verdict = "✅ IsNext (이어지는 문장)"
	    else:
	        verdict = "❌ NotNext (관련 없는 문장)"

	    sections = [
	        f"문장 A: {sentence_a}",
	        f"문장 B: {sentence_b}",
	        f"예측 결과: {verdict}",
	        f"- P(IsNext) = `{is_next_prob:.4f}`\n- P(NotNext) = `{not_next_prob:.4f}`",
	    ]
	    return "\n\n".join(sections)


	# ---------------------------------------------------------------------------
	# 3. 문장 쌍 분류 (논문 §4.1, Figure 4a)
	# ---------------------------------------------------------------------------

	def run_mnli(premise: str, hypothesis: str) -> str:
	    """Classify a premise/hypothesis pair (MNLI) and format the result.

	    Args:
	        premise: The premise sentence.
	        hypothesis: The hypothesis sentence.

	    Returns:
	        A Markdown string with the predicted relation and its score, or a
	        usage hint when either input is blank.
	    """
	    if not premise.strip() or not hypothesis.strip():
	        return "❗ 전제(premise)와 가설(hypothesis)을 모두 입력해 주세요."

	    nlp = get_pipeline("mnli")
	    # MNLI is a 3-way task: entailment / neutral / contradiction.
	    # Hand the two sentences over as text / text_pair so the tokenizer encodes
	    # them as a sentence pair, instead of gluing them into one string with a
	    # literal "[SEP]" (which leaves the whole input as a single segment).
	    result = nlp({"text": premise, "text_pair": hypothesis})
	    # Depending on the transformers version a single dict input comes back as
	    # a dict or as a one-element list; accept both.
	    if isinstance(result, list):
	        result = result[0]
	    label = result["label"]
	    score = result["score"]

	    # Display labels with a Korean gloss.
	    # NOTE(review): the LABEL_n order is carried over unchanged from the
	    # previous mapping — confirm against the checkpoint's config.
	    label_kor = {
	        "entailment": "entailment (함의)",
	        "neutral": "neutral (중립)",
	        "contradiction": "contradiction (모순)",
	        "LABEL_0": "contradiction (모순)",
	        "LABEL_1": "entailment (함의)",
	        "LABEL_2": "neutral (중립)",
	    }.get(label, label)

	    return (
	        f"전제 (Premise): {premise}\n\n"
	        f"가설 (Hypothesis): {hypothesis}\n\n"
	        f"예측 결과: `{label_kor}` (확신도 `{score:.4f}`)"
	    )


	# ---------------------------------------------------------------------------
	# 4. 단일 문장 분류 (논문 §4.1, Figure 4b)
	# ---------------------------------------------------------------------------

	def run_sst2(text: str) -> str:
	    """Run SST-2 binary sentiment classification and format the result.

	    Args:
	        text: Sentence to classify.

	    Returns:
	        A Markdown string with the sentiment label and its score, or a usage
	        hint when *text* is blank.
	    """
	    if not text.strip():
	        return "❗ 문장을 입력해 주세요."

	    prediction = get_pipeline("sst2")(text)[0]

	    # Translate the checkpoint's generic labels; anything else passes through.
	    raw_label = prediction["label"]
	    if raw_label == "LABEL_0":
	        sentiment = "😞 부정 (Negative)"
	    elif raw_label == "LABEL_1":
	        sentiment = "😀 긍정 (Positive)"
	    else:
	        sentiment = raw_label

	    confidence = prediction["score"]
	    sections = (
	        f"입력 문장: {text}",
	        f"감성: {sentiment}",
	        f"확신도: `{confidence:.4f}`",
	    )
	    return "\n\n".join(sections)


	# ---------------------------------------------------------------------------
	# 5. 질의응답 (논문 §4.2, Figure 4c)
	# ---------------------------------------------------------------------------

	def run_squad(context: str, question: str) -> str:
	    """Answer *question* with a span extracted from *context*.

	    Args:
	        context: Passage to search for the answer.
	        question: Question about the passage.

	    Returns:
	        A Markdown string with the answer text, its character offsets and the
	        score, or a usage hint when either input is blank.
	    """
	    if not (context.strip() and question.strip()):
	        return "❗ 지문(context)과 질문(question)을 모두 입력해 주세요."

	    # The paper frames QA as predicting a start token S and an end token E over
	    # the passage tokens (§4.2); the pipeline wraps that procedure.
	    qa = get_pipeline("squad")
	    span = qa(question=question, context=context)

	    answer = span["answer"]
	    start = span["start"]
	    end = span["end"]
	    confidence = span["score"]

	    sections = [
	        f"질문: {question}",
	        f"답변: {answer}",
	        f"- 답변 위치: 문자 `{start}`–`{end}`\n- 확신도 점수: `{confidence:.4f}`",
	    ]
	    return "\n\n".join(sections)


	# ---------------------------------------------------------------------------
	# 6. 개체명 인식 (논문 §5.3, Figure 4d)
	# ---------------------------------------------------------------------------

	def run_ner(text: str) -> str:
	    """Tag named entities in *text* and list them as Markdown bullets.

	    Args:
	        text: Sentence to analyse.

	    Returns:
	        A Markdown string with one bullet per detected entity (word, type,
	        score, character offsets), a "no entities" note when nothing was
	        found, or a usage hint when *text* is blank.
	    """
	    if not text.strip():
	        return "❗ 분석할 문장을 입력해 주세요."

	    entities = get_pipeline("ner")(text)
	    if not entities:
	        return f"입력 문장: {text}\n\n_검출된 개체가 없습니다._"

	    # Entity groups shown with a Korean gloss; unknown groups pass through.
	    entity_kor = {
	        "PER": "인물 (PER)",
	        "ORG": "조직 (ORG)",
	        "LOC": "장소 (LOC)",
	        "MISC": "기타 (MISC)",
	    }

	    def describe(ent) -> str:
	        """Render one entity dict as a Markdown bullet."""
	        group = ent["entity_group"]
	        kor_label = entity_kor.get(group, group)
	        head = f"- {ent['word']} → `{kor_label}` "
	        tail = f"(점수 `{ent['score']:.3f}`, 문자 {ent['start']}–{ent['end']})"
	        return head + tail

	    header = [f"입력 문장: {text}", "", "검출된 개체:", ""]
	    return "\n".join(header + [describe(ent) for ent in entities])


	# ---------------------------------------------------------------------------
	# Gradio UI
	# ---------------------------------------------------------------------------

	# Markdown rendered at the top of the page by build_ui(). The paper title is
	# wrapped in one balanced pair of emphasis markers.
	INTRO_MD = """
	# 🤖 BERT 인터랙티브 데모

	이 Space는 *Devlin et al. (2019), BERT: Pre-training of Deep Bidirectional Transformers
	for Language Understanding* ([arXiv:1810.04805](https://arxiv.org/abs/1810.04805)) 논문의 실험들을
	직접 체험해보는 페이지입니다.

	처음 두 탭은 논문의 사전학습 목적함수(§3.1) 를 보여줍니다:
	- Masked LM — 무작위로 15% 토큰을 가리고 양방향 문맥으로 예측
	- Next Sentence Prediction — 문장 B가 문장 A 다음에 오는지 판단

	나머지 네 탭은 Figure 4의 파인튜닝 태스크 카테고리를 다룹니다:
	- (a) 문장 쌍 분류 — MNLI
	- (b) 단일 문장 분류 — SST-2
	- (c) 질의응답 — SQuAD v1.1
	- (d) 단일 문장 태깅 — CoNLL-2003 NER

	> 💡 각 탭을 처음 사용할 때 사전학습 체크포인트가 Hub에서 다운로드됩니다(약 400 MB – 1.3 GB).
	> 그 이후 호출은 빠릅니다.
	>
	> 📝 사용된 모델은 모두 영어 BERT입니다. 한국어 BERT로 바꾸려면 `klue/bert-base` 등으로 모델 ID를 교체하세요.
	"""


	def build_ui() -> gr.Blocks:
	    """Assemble the six-tab Gradio demo and return it without launching it.

	    The page consists of the intro Markdown, one tab per task (Masked LM,
	    NSP, MNLI, SST-2, SQuAD, NER) whose button is wired to the matching
	    ``run_*`` handler, and a footer with links.

	    Returns:
	        The ``gr.Blocks`` object; the caller is expected to ``.launch()`` it.
	    """
	    examples_label = "예시 (클릭해서 사용)"
	    # One passage serves as the default SQuAD context and in both example rows.
	    bert_passage = (
	        "BERT was introduced by researchers at Google AI Language in "
	        "October 2018. It stands for Bidirectional Encoder Representations "
	        "from Transformers and is pre-trained on the BooksCorpus (800M "
	        "words) and English Wikipedia (2,500M words). BERT-Large has 340 "
	        "million parameters."
	    )

	    with gr.Blocks(title="BERT 데모", theme=gr.themes.Soft()) as demo:
	        gr.Markdown(INTRO_MD)

	        with gr.Tabs():
	            # ----- Tab 1: Masked LM -----
	            with gr.Tab("1️⃣ Masked LM (사전학습)"):
	                mlm_examples = [
	                    "The capital of France is [MASK].",
	                    "I went to the bank to deposit my [MASK].",
	                    "Albert Einstein was a famous [MASK].",
	                    "She opened the door and [MASK] inside.",
	                ]
	                gr.Markdown(
	                    "문장 어디에든 `[MASK]` 토큰을 넣어보세요. 모델이 왼쪽과 오른쪽 문맥을 모두 "
	                    "사용하여 그 자리에 올 가장 가능성 높은 단어들을 예측합니다."
	                )
	                mlm_in = gr.Textbox(
	                    label="[MASK]가 포함된 문장",
	                    value=mlm_examples[0],
	                    lines=2,
	                )
	                mlm_out = gr.Markdown()
	                mlm_btn = gr.Button("예측하기", variant="primary")
	                mlm_btn.click(fn=run_mlm, inputs=mlm_in, outputs=mlm_out)
	                gr.Examples(examples=mlm_examples, inputs=mlm_in, label=examples_label)

	            # ----- Tab 2: NSP -----
	            with gr.Tab("2️⃣ Next Sentence Prediction (사전학습)"):
	                nsp_examples = [
	                    ["The man went to the store.", "He bought a gallon of milk."],
	                    ["The man went to the store.", "Penguins are flightless birds."],
	                    ["She studied all night for the exam.", "She felt confident the next morning."],
	                ]
	                gr.Markdown(
	                    "두 문장이 주어졌을 때, 두 번째 문장이 첫 번째 문장 뒤에 실제로 이어지는 "
	                    "문장인지(`IsNext`), 아니면 코퍼스에서 무작위로 뽑힌 관련 없는 문장인지"
	                    "(`NotNext`)를 판단합니다."
	                )
	                with gr.Row():
	                    nsp_a = gr.Textbox(label="문장 A", value=nsp_examples[0][0], lines=2)
	                    nsp_b = gr.Textbox(label="문장 B", value=nsp_examples[0][1], lines=2)
	                nsp_out = gr.Markdown()
	                nsp_btn = gr.Button("예측하기", variant="primary")
	                nsp_btn.click(fn=run_nsp, inputs=[nsp_a, nsp_b], outputs=nsp_out)
	                gr.Examples(examples=nsp_examples, inputs=[nsp_a, nsp_b], label=examples_label)

	            # ----- Tab 3: sentence-pair classification (MNLI) -----
	            with gr.Tab("3️⃣ 문장 쌍 분류 — MNLI"):
	                mnli_examples = [
	                    ["A soccer game with multiple males playing.", "Some men are playing a sport."],
	                    ["A man is playing a guitar.", "A man is sleeping."],
	                    ["The dog is running through the field.", "The animal is moving."],
	                ]
	                gr.Markdown(
	                    "Multi-Genre Natural Language Inference (MNLI). "
	                    "전제(premise) 와 가설(hypothesis) 이 주어지면, 둘의 관계를 "
	                    "함의(entailment), 중립(neutral), 모순(contradiction) 중 하나로 분류합니다. "
	                    "논문 Figure 4(a)에 해당합니다."
	                )
	                with gr.Row():
	                    mnli_p = gr.Textbox(
	                        label="전제 (Premise)",
	                        value="A man inspects the uniform of a figure in some East Asian country.",
	                        lines=2,
	                    )
	                    mnli_h = gr.Textbox(
	                        label="가설 (Hypothesis)",
	                        value="The man is sleeping.",
	                        lines=2,
	                    )
	                mnli_out = gr.Markdown()
	                mnli_btn = gr.Button("분류하기", variant="primary")
	                mnli_btn.click(fn=run_mnli, inputs=[mnli_p, mnli_h], outputs=mnli_out)
	                gr.Examples(examples=mnli_examples, inputs=[mnli_p, mnli_h], label=examples_label)

	            # ----- Tab 4: single-sentence classification (SST-2) -----
	            with gr.Tab("4️⃣ 단일 문장 분류 — SST-2"):
	                sst_examples = [
	                    "This movie was absolutely fantastic — I loved every minute of it.",
	                    "What a complete waste of time and money.",
	                    "The cinematography was breathtaking and the score was sublime.",
	                    "I have never been so bored in my entire life.",
	                ]
	                gr.Markdown(
	                    "Stanford Sentiment Treebank (SST-2). 영화 리뷰 문장의 감성을 "
	                    "긍정 / 부정 이진 분류합니다. 논문 Figure 4(b)에 해당합니다."
	                )
	                sst_in = gr.Textbox(label="문장", value=sst_examples[0], lines=2)
	                sst_out = gr.Markdown()
	                sst_btn = gr.Button("감성 분석", variant="primary")
	                sst_btn.click(fn=run_sst2, inputs=sst_in, outputs=sst_out)
	                gr.Examples(examples=sst_examples, inputs=sst_in, label=examples_label)

	            # ----- Tab 5: question answering (SQuAD) -----
	            with gr.Tab("5️⃣ 질의응답 — SQuAD v1.1"):
	                squad_examples = [
	                    [bert_passage, "When was BERT introduced?"],
	                    [bert_passage, "What does BERT stand for?"],
	                ]
	                gr.Markdown(
	                    "Stanford Question Answering Dataset (SQuAD v1.1). 지문과 질문이 주어지면, "
	                    "모델이 지문 안에서 답이 시작되는 위치와 끝나는 위치, 즉 답변 span 을 예측합니다. "
	                    "논문 Figure 4(c)에 해당합니다."
	                )
	                squad_ctx = gr.Textbox(label="지문 (Context)", value=bert_passage, lines=6)
	                squad_q = gr.Textbox(
	                    label="질문 (Question)",
	                    value="How many parameters does BERT-Large have?",
	                )
	                squad_out = gr.Markdown()
	                squad_btn = gr.Button("답변 찾기", variant="primary")
	                squad_btn.click(fn=run_squad, inputs=[squad_ctx, squad_q], outputs=squad_out)
	                gr.Examples(
	                    examples=squad_examples,
	                    inputs=[squad_ctx, squad_q],
	                    label=examples_label,
	                )

	            # ----- Tab 6: NER -----
	            with gr.Tab("6️⃣ 개체명 인식 — CoNLL-2003"):
	                ner_examples = [
	                    "Jacob Devlin works at Google in Mountain View, California.",
	                    "Apple CEO Tim Cook announced the new iPhone in Cupertino.",
	                    "Angela Merkel met Emmanuel Macron in Berlin last Tuesday.",
	                ]
	                gr.Markdown(
	                    "CoNLL-2003 NER. 각 토큰을 인물(PER), 조직(ORG), 장소(LOC), "
	                    "기타(MISC) 카테고리로 분류하는 토큰 단위 태깅 태스크입니다. "
	                    "논문 Figure 4(d)에 해당합니다."
	                )
	                ner_in = gr.Textbox(label="문장", value=ner_examples[0], lines=2)
	                ner_out = gr.Markdown()
	                ner_btn = gr.Button("개체 인식", variant="primary")
	                ner_btn.click(fn=run_ner, inputs=ner_in, outputs=ner_out)
	                gr.Examples(examples=ner_examples, inputs=ner_in, label=examples_label)

	        # Footer links, placed below the tab container.
	        footer_md = (
	            "---\n"
	            "📄 논문: [arXiv:1810.04805](https://arxiv.org/abs/1810.04805) • "
	            "💻 원 저자 코드: [google-research/bert](https://github.com/google-research/bert) • "
	            "🤗 모델: [bert-base-uncased](https://huggingface.co/bert-base-uncased)"
	        )
	        gr.Markdown(footer_md)

	    return demo


	if __name__ == "__main__":
	    # Script entry point: build the interface, then start serving it.
	    demo_app = build_ui()
	    demo_app.launch()