N_B_analysis-5

Sleeping

App Files Files Community

N_B_analysis-5 / app.py

Kims12

Update app.py

0b91ba4 verified 9 months ago

raw

history blame contribute delete

10.4 kB

	import gradio as gr
	import pandas as pd
	import tempfile
	import re
	import logging
	from mecab import MeCab

	# Configure logging at DEBUG level; count_keywords() logs its raw inputs
	# (body text, upload, keywords) at this level, so they appear in the log output.
	logging.basicConfig(level=logging.DEBUG)

	##############################
	# 1) 공통 함수들
	##############################

	def preprocess_text(text: str) -> str:
	    """
	    Return *text* with every character outside the Hangul syllable range removed.

	    Commas, periods, whitespace, digits, Latin letters and anything else that
	    is not in the range 가-힣 are dropped, so the surviving Hangul syllables
	    end up directly adjacent to each other.

	    ``None`` is tolerated and treated as an empty string, so callers can pass
	    an unset UI value straight through without a TypeError from ``re.sub``.
	    """
	    if text is None:
	        return ""
	    return re.sub(r'[^가-힣]', '', text)

	def expand_columns_if_needed(df, needed_index: int):
	    """
	    Grow *df* in place until positional column ``needed_index`` exists.

	    Empty (all-None) columns are appended at the right edge until
	    ``df.shape[1] > needed_index``. Example: ``needed_index=13`` guarantees
	    at least 14 columns, so the 14th column (spreadsheet column N) can be
	    addressed with ``df.iloc[:, 13]``.

	    New columns are labelled with the first unused integer at or above the
	    current column count. For a frame whose columns are the default
	    0..n-1 this yields n, n+1, ... as before; for a frame that already
	    uses such an integer as a label it avoids overwriting that column
	    (which would both destroy its data and never increase the column
	    count, so the loop would not terminate).

	    Returns None; the frame is modified in place.
	    """
	    next_label = df.shape[1]
	    while df.shape[1] <= needed_index:
	        # Skip labels that are already taken so the assignment below always
	        # adds a column instead of replacing an existing one.
	        while next_label in df.columns:
	            next_label += 1
	        df[next_label] = None
	        next_label += 1

	##############################
	# 2) 키워드 카운트 함수
	##############################

	def _markdown_keyword_table(pairs):
	    """Render (keyword, count) pairs as a two-column Markdown table."""
	    rows = ["| 명사 | 빈도수 |", "|---|---|"]
	    rows.extend(f"| {keyword} | {count} |" for keyword, count in pairs)
	    return "\n".join(rows)


	def _write_temp_csv(frame, header=True):
	    """Write *frame* to a new temporary .csv file (UTF-8 with BOM) and return its path."""
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
	        tmp_path = tmp.name
	    # Written after the handle is closed so the file is not opened twice at
	    # the same time; delete=False keeps it on disk for the download widget.
	    frame.to_csv(tmp_path, index=False, header=header, encoding='utf-8-sig')
	    return tmp_path


	def count_keywords(main_text, excel_file, direct_input):
	    """
	    Count how often each keyword occurs in *main_text*.

	    The body text is first reduced to Hangul syllables only (see
	    ``preprocess_text``) and each keyword is counted with ``str.count``.
	    Keywords are matched verbatim, so a keyword containing spaces, digits
	    or Latin letters can never match the cleaned body.

	    Keyword source, in priority order:

	    * ``direct_input`` - newline-separated keywords. If it contains any
	      non-blank text it is used and ``excel_file`` is ignored. The result
	      CSV has the columns 명사 (keyword) and 빈도수 (count) for *all*
	      keywords, including those with zero hits.
	    * ``excel_file`` - an uploaded workbook read with ``header=None`` so row
	      1 is kept as data. Keywords are taken from A5..A10000 (column index 0,
	      row indices 4..9999); blank cells are skipped. Each count is written
	      into column N (index 13) on the *same row* as its keyword, and the
	      whole sheet is exported as CSV without an added header row so the
	      spreadsheet row numbers are preserved.

	    Args:
	        main_text: Body text to search. None is treated as empty.
	        excel_file: Uploaded file, either a path string or an object whose
	            ``.name`` attribute is the path. May be None.
	        direct_input: Newline-separated keywords. None is treated as empty.

	    Returns:
	        A ``(message_or_markdown, csv_path_or_None)`` tuple. The first item
	        is a Markdown table listing only keywords found at least once, or a
	        plain message when nothing was found / no keywords were supplied.
	    """
	    logging.debug("main_text: %s", main_text)
	    logging.debug("excel_file: %s", excel_file)
	    logging.debug("direct_input: %s", direct_input)

	    cleaned_text = preprocess_text(main_text or "")
	    direct_input = (direct_input or "").strip()

	    if direct_input:
	        # ===== keywords typed directly =====
	        # direct_input is non-blank here, so at least one keyword survives.
	        keywords = [line.strip() for line in direct_input.split('\n') if line.strip()]
	        counts = [cleaned_text.count(keyword) for keyword in keywords]
	        hits = [(keyword, count) for keyword, count in zip(keywords, counts) if count > 0]

	        # The CSV always lists every keyword, even when all counts are zero.
	        csv_path = _write_temp_csv(pd.DataFrame({"명사": keywords, "빈도수": counts}))
	        if not hits:
	            return ("본문에 해당 키워드가 전혀 등장하지 않았습니다.", csv_path)
	        return (_markdown_keyword_table(hits), csv_path)

	    # ===== keywords from the uploaded workbook =====
	    if not excel_file:
	        return ("엑셀 파일을 업로드하거나 키워드를 직접 입력하세요.", None)

	    # The upload may arrive as an object exposing .name or as a bare path
	    # string, depending on how the file component is configured.
	    excel_path = getattr(excel_file, "name", excel_file)
	    df = pd.read_excel(excel_path, header=None)

	    # A5..A10000 -> row indices 4..9999, capped at the real row count.
	    max_row = min(df.shape[0], 10000)

	    # Remember the sheet row of every keyword so that blank cells in column A
	    # cannot shift the counts onto the wrong rows.
	    keyword_rows = []
	    if df.shape[1] > 0:
	        column_a = df.iloc[4:max_row, 0].tolist()
	        for row_idx, cell in enumerate(column_a, start=4):
	            if pd.isna(cell):
	                continue
	            keyword = str(cell).strip()
	            # An empty keyword would make str.count return len(text) + 1.
	            if keyword:
	                keyword_rows.append((row_idx, keyword))

	    if not keyword_rows:
	        return ("A5~A10000 범위에 키워드가 없습니다.", None)

	    counts = [cleaned_text.count(keyword) for _, keyword in keyword_rows]

	    # Record every count (zeros included) in column N next to its keyword.
	    expand_columns_if_needed(df, 13)
	    for (row_idx, _), count in zip(keyword_rows, counts):
	        df.iloc[row_idx, 13] = count

	    # header=False: the frame was read without a header, so emitting the
	    # synthetic 0..n column labels would push every sheet row down by one.
	    csv_path = _write_temp_csv(df, header=False)

	    hits = [(keyword, count) for (_, keyword), count in zip(keyword_rows, counts) if count > 0]
	    if not hits:
	        return ("본문에 해당 키워드가 전혀 등장하지 않았습니다(0회).", csv_path)
	    return (_markdown_keyword_table(hits), csv_path)

	##############################
	# 3) 형태소 분석 기반 키워드 카운트 함수
	##############################

	def morph_analysis_and_count(text: str):
	    """
	    Extract nouns from *text* with MeCab and count their occurrences.

	    Steps:
	      1. Reduce the text to Hangul syllables only (``preprocess_text``).
	      2. Run the MeCab tagger's ``pos`` on the cleaned text.
	      3. Keep the words whose tag is one of NNG, NNP, NP or NNB.
	      4. Count each distinct word in the cleaned text with ``str.count``
	         (a substring count, so a word is also counted where it appears
	         inside a longer word).

	    Rows are ordered by descending count; words with equal counts keep the
	    order in which they first appear in the text, so repeated runs on the
	    same input produce the same table and CSV.

	    Args:
	        text: Body text to analyse. None is treated as empty.

	    Returns:
	        ``(markdown_table, csv_path)`` on success, or
	        ``("추출된 명사가 없습니다.", None)`` when no noun was found.
	    """
	    # 1) Pre-processing
	    cleaned = preprocess_text(text or "")
	    if not cleaned:
	        # Nothing to analyse; skip constructing the tagger altogether.
	        return "추출된 명사가 없습니다.", None

	    # 2) MeCab analysis
	    tagger = MeCab()
	    parsed = tagger.pos(cleaned)

	    # 3) Keep noun-tagged words, de-duplicated in first-appearance order
	    #    (dict.fromkeys preserves insertion order, unlike set()).
	    noun_tags = {'NNG', 'NNP', 'NP', 'NNB'}
	    unique_nouns = list(
	        dict.fromkeys(word for (word, pos) in parsed if pos in noun_tags and word)
	    )

	    # 4) Count every noun in the cleaned text
	    freq_pairs = [(noun, cleaned.count(noun)) for noun in unique_nouns]
	    freq_pairs = [(noun, count) for noun, count in freq_pairs if count > 0]
	    if not freq_pairs:
	        return "추출된 명사가 없습니다.", None

	    # list.sort is stable, also with reverse=True, so ties keep text order.
	    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
	    freq_df = pd.DataFrame(freq_pairs, columns=['명사', '빈도수'])

	    try:
	        md_table = freq_df.to_markdown(index=False)
	    except ImportError:
	        # to_markdown needs the optional 'tabulate' package; build the table
	        # by hand instead of giving up, so the CSV below is still produced.
	        header = ["| 명사 | 빈도수 |", "|---|---|"]
	        md_table = "\n".join(
	            header + [f"| {noun} | {count} |" for noun, count in freq_pairs]
	        )

	    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
	        tmp_path = tmp.name
	    # Written after the handle is closed; delete=False keeps the file for download.
	    freq_df.to_csv(tmp_path, index=False, encoding='utf-8-sig')

	    return md_table, tmp_path

	########################
	# 4) Gradio 인터페이스 #
	########################

	# Add button colours to the existing CSS.
	# This string is passed to gr.Blocks(css=...) below; its selectors target the
	# elem_id values ("run_analysis_button", "morph_analysis_button") given to the
	# two gr.Button components.
	css = """

	/* '분석하기' 버튼 색상 및 글자색 변경 */
	#run_analysis_button > button,
	#morph_analysis_button > button {
	background-color: #EA580C !important; /* 진한 주황색 */
	color: #FFFFFF !important; /* 흰색 글자 */
	}
	"""

	# Build the two-tab Gradio UI: tab 1 calls count_keywords(), tab 2 calls
	# morph_analysis_and_count(). The theme is gr.themes.Soft with a custom
	# primary palette and the extra CSS defined above.
	# NOTE(review): the nesting below was reconstructed from a source whose
	# indentation had been flattened — confirm the layout against the running app.
	with gr.Blocks(
	    theme=gr.themes.Soft(
	        primary_hue=gr.themes.Color(
	            c50="#FFF7ED",
	            c100="#FFEDD5",
	            c200="#FED7AA",
	            c300="#FDBA74",
	            c400="#FB923C",
	            c500="#F97316",
	            c600="#EA580C",
	            c700="#C2410C",
	            c800="#9A3412",
	            c900="#7C2D12",
	            c950="#431407",
	        ),
	        secondary_hue="zinc",
	        neutral_hue="zinc",
	        font=("Pretendard", "sans-serif")
	    ),
	    css=css
	) as demo:
	    with gr.Tab("키워드 카운트"):
	        with gr.Row():
	            # Left: input area
	            with gr.Column():
	                main_textbox = gr.Textbox(
	                    label="본문 텍스트",
	                    lines=16,
	                    placeholder="여기에 긴 본문을 붙여넣으세요."
	                )
	                keyword_input = gr.Textbox(
	                    label="(선택) 직접 입력 키워드 - 엔터로 구분",
	                    lines=6,
	                    placeholder="예)\n초음파가습기\n가습기\n..."
	                )
	                excel_input = gr.File(
	                    label="(선택) 엑셀 업로드"
	                )
	                # elem_id lets the custom CSS target this button
	                run_button = gr.Button("분석하기", elem_id="run_analysis_button")

	            # Right: output area
	            with gr.Column():
	                output_md = gr.Markdown(label="결과 표")
	                output_file = gr.File(label="결과 다운로드")

	        # Input order must match count_keywords(main_text, excel_file, direct_input).
	        run_button.click(
	            fn=count_keywords,
	            inputs=[main_textbox, excel_input, keyword_input],
	            outputs=[output_md, output_file]
	        )

	    with gr.Tab("형태소 분석 기반 카운트"):
	        with gr.Row():
	            # Left: input area
	            with gr.Column():
	                morph_text_input = gr.Textbox(
	                    label="본문 텍스트",
	                    lines=16,
	                    placeholder="여기에 긴 본문을 붙여넣으세요."
	                )
	                # elem_id lets the custom CSS target this button
	                morph_run_button = gr.Button("분석하기", elem_id="morph_analysis_button")

	            # Right: output area
	            with gr.Column():
	                morph_result_display = gr.Markdown(label="분석 결과")
	                morph_download_button = gr.File(label="결과 다운로드")

	        # The function returns (markdown, csv_path), mapped onto the two outputs in order.
	        morph_run_button.click(
	            fn=morph_analysis_and_count,
	            inputs=morph_text_input,
	            outputs=[morph_result_display, morph_download_button]
	        )

	# Launch the Gradio app only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    demo.launch()