Spaces:

SonFox2920
/

Vnese_crawl

Build error

App Files Files Community

Vnese_crawl / app.py

SonFox2920

Update app.py

c375407 verified about 1 year ago

raw

history blame contribute delete

40.6 kB

	# Import custom modules for prediction
	from predictor import predict
	import google.generativeai as genai
	import streamlit as st
	import pandas as pd
	import wikipedia
	import json
	import regex as re
	from typing import List, Dict, Any
	import google.generativeai as genai
	import warnings
	import logging
	import time
	import random

	# Disable warnings and logging
	# NOTE: filterwarnings('ignore') silences every Python warning for the whole process, and
	# logging.disable(logging.WARNING) suppresses all log records at WARNING severity and below.
	warnings.filterwarnings('ignore')
	logging.disable(logging.WARNING)
	# Set Wikipedia language to Vietnamese
	# (module-level setting of the `wikipedia` package, made before any page/summary lookup below).
	wikipedia.set_lang("vi")

	# Character to number mapping for Vietnamese alphabet.
	# The 29 letters are listed in alphabet order; each one maps to its 1-based position.
	char_to_num = {
	    letter: position
	    for position, letter in enumerate("AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY", start=1)
	}

	class ClaimGenerator:
	"""Class to handle advanced claim generation with different APIs."""

	def __init__(self, claim_type: str):
	    """Set up a generator for one claim label.

	    Args:
	        claim_type: One of "SUPPORTED", "REFUTED" or "NEI". Any other value
	            raises KeyError when the API key is looked up.

	    Side effects:
	        Configures the global ``google.generativeai`` client with the key for
	        ``claim_type`` and creates ``st.session_state.all_generated_claims``
	        if it does not exist yet.
	    """
	    import os  # local import: only needed to read the key overrides below

	    self.claim_type = claim_type

	    # SECURITY NOTE(review): these keys are committed in source control and must be
	    # treated as leaked -- rotate them and supply replacements through the
	    # GEMINI_API_KEY_<LABEL> environment variables (e.g. Space secrets). The embedded
	    # values are kept only as a fallback so existing deployments keep working.
	    embedded_keys = {
	        "SUPPORTED": "AIzaSyCAMvWn4npcsO6ypWWB2cEnQQKuLIcB8JA",
	        "REFUTED": "AIzaSyAn1E_e4iAP-zDUV1R7hQa3DCkRfAOxW_8",
	        "NEI": "AIzaSyAciDaUfIhcK32qDhmozXAOKdSjgQeOSis"
	    }
	    self.API_KEYS = {
	        label: os.environ.get(f"GEMINI_API_KEY_{label}") or fallback_key
	        for label, fallback_key in embedded_keys.items()
	    }
	    self.current_key = self.API_KEYS[self.claim_type]
	    self._configure_api()
	    # Leading-word patterns of claims already returned by this instance.
	    self.used_patterns = set()

	    # Add prompt variations ({0} is filled with the claim label)
	    self.prompt_variations = [
	        "Tạo câu claim {0} sử dụng cấu trúc phức hợp.",
	        "Tạo câu claim {0} sử dụng phép ẩn dụ hoặc so sánh.",
	        "Tạo câu claim {0} sử dụng cấu trúc đối chiếu.",
	        "Tạo câu claim {0} mô tả mối quan hệ nhân quả.",
	        "Tạo câu claim {0} dùng lối diễn đạt gián tiếp."
	    ]

	    # Add diversity instructions (one is picked at random per generation call)
	    self.diversity_instructions = [
	        """
	QUAN TRỌNG: Mỗi câu claim phải có cấu trúc câu khác nhau:
	- Câu 1: Bắt đầu với chủ ngữ.
	- Câu 2: Bắt đầu với trạng ngữ thời gian hoặc địa điểm.
	- Câu 3: Bắt đầu với mệnh đề phụ.
	- Câu 4: Bắt đầu với câu hỏi gián tiếp hoặc dẫn nhập.
	- Câu 5: Sử dụng cấu trúc so sánh.

	Tránh lặp lại cấu trúc câu, từ ngữ, hoặc cách diễn đạt từ những câu trước đó.
	""",
	        """
	YÊU CẦU ĐA DẠNG: Đảm bảo mỗi câu claim có cách tiếp cận khác biệt:
	- Sử dụng cấu trúc câu phức, câu đơn và câu ghép xen kẽ.
	- Xen kẽ các cấu trúc câu khẳng định và câu điều kiện.
	- Thay đổi vị trí trạng ngữ trong câu.
	- Kết hợp các loại mệnh đề khác nhau.
	""",
	        """
	LƯU Ý VỀ ĐA DẠNG: Đảm bảo đa dạng phong cách ngôn ngữ:
	- Thay đổi độ dài câu (ngắn, trung bình, dài).
	- Sử dụng các từ nối khác nhau.
	- Xen kẽ giữa cách diễn đạt trực tiếp và gián tiếp.
	- Kết hợp cả lối diễn đạt học thuật và bình dân.
	"""
	    ]

	    # Add n-gram patterns from analysis (per label)
	    self.ngram_patterns = {
	        "SUPPORTED": [
	            "miền nam việt nam",
	            "hoàng sa và trường",
	            "sa và trường sa",
	            "quần đảo hoàng sa",
	            "sân bay quốc tế",
	            "báo cáo tài chính",
	            "lọc hoá dầu bình",
	            "được sử dụng trong",
	            "là ngôn ngữ chính",
	            "ngôn ngữ chính thức",
	            "khu vực đông nam",
	            "bình quân đầu người",
	            "lấy mẫu xét nghiệm",
	            "các sàn thương mại",
	            "là một trong những",
	            "có ít nhất một",
	            "trái phép chất ma"
	        ],
	        "REFUTED": [
	            "số lượng người nói",
	            "không có bất kỳ",
	            "thời kỳ bắc thuộc",
	            "điều tra dân số",
	            "việt nam dân chủ",
	            "nam dân chủ cộng",
	            "dân chủ cộng hòa",
	            "lượng người nói tiếng",
	            "người nói tiếng anh",
	            "được công nhận là",
	            "học viện y dược",
	            "viện y dược học",
	            "y dược học cổ",
	            "dược học cổ truyền",
	            "học cổ truyền việt",
	            "cổ truyền việt nam"
	        ],
	        "NEI": [
	            "có thể",
	            "phát triển",
	            "sử dụng",
	            "công nghệ",
	            "thực hiện",
	            "ảnh hưởng",
	            "thời gian",
	            "khu vực"
	        ]
	    }

	    # Track all generated claims in this session
	    if 'all_generated_claims' not in st.session_state:
	        st.session_state.all_generated_claims = []

	    # Response schemas passed to the model; each expects an object with a
	    # "claims" array of strings, plus label-specific guidance in "description".
	    self.SCHEMAS = {
	        "SUPPORTED": {
	            "type": "object",
	            "properties": {
	                "claims": {
	                    "type": "array",
	                    "items": {
	                        "type": "string"
	                    },
	                    "description": """
	Hướng dẫn tạo claim SUPPORTED có độ khó cao:

	1. Bắt buộc đề cập, liên quan đến Title của context
	- Câu claim phải liên quan trực tiếp đến Title của context.
	- Câu claim phải liên quan đến nội dung của context.
	- Chỉ sử dụng thông tin từ context, không dùng kiến thức ngoài.

	2. Sử dụng cấu trúc và từ vựng phức tạp hơn
	- Ưu tiên dùng cụm từ đặc trưng của nhãn REFUTED và NEI như:
	- "vào năm", "đầu tiên", "duy nhất", "trên thế giới", "chỉ có"
	- "có thể", "phát triển", "sử dụng", "công nghệ", "thực hiện", "ảnh hưởng"
	- Tạo câu claim SUPPORTED nhưng "ngụy trang" bằng cấu trúc của REFUTED/NEI:
	- "Vào năm [X], [Title] đã trở thành..." (dùng cụm từ đặc trưng của REFUTED)
	- "[Title] có thể được coi là yếu tố quan trọng..." (dùng cụm từ đặc trưng của NEI)
	- Tránh sử dụng cụm từ quá đặc trưng của SUPPORTED:
	- Hạn chế dùng "quần đảo hoàng", "trong lịch sử", "miền nam việt"

	3. Kết hợp thông tin từ nhiều phần khác nhau của context
	- Ghép thông tin từ nhiều câu trong context để tạo claim mới.
	- Thêm các chi tiết phụ nhưng vẫn nằm trong context.
	- Kết hợp nhiều n-gram đặc trưng của REFUTED/NEI trong một câu SUPPORTED.

	4. Sử dụng chiến lược "đánh lừa" đặc biệt
	- Thêm "các quốc gia", "trên thế giới" vào câu nhưng giữ nguyên nội dung SUPPORTED.
	- Dùng "không có bất kỳ" theo cách khẳng định thông tin đúng.
	- Sử dụng "trở thành một" nhưng vẫn giữ thông tin chính xác.
	- Thêm các cụm từ định lượng đặc trưng của REFUTED như "đầu tiên", "duy nhất" nhưng áp dụng chính xác.

	5. Kết hợp với từ vựng đặc trưng của NEI
	- Thêm "có thể", "phát triển", "sử dụng", "công nghệ" vào câu SUPPORTED.
	- Sử dụng "ảnh hưởng", "thời gian", "khu vực" với dữ liệu chính xác từ context.
	- Kết hợp "công ty", "tổ chức" vào câu khi phù hợp với context.

	6. Tránh lỗi phổ biến
	- Không copy nguyên văn từ context.
	- Không tạo câu quá đơn giản hoặc rõ ràng là SUPPORTED.
	- Tránh n-gram quá điển hình của SUPPORTED như "quần đảo hoàng sa", "miền nam việt nam".
	"""
	                }
	            },
	            "required": ["claims"]
	        },
	        "REFUTED": {
	            "type": "object",
	            "properties": {
	                "claims": {
	                    "type": "array",
	                    "items": {
	                        "type": "string"
	                    },
	                    "description": """
	Hướng dẫn tạo claim REFUTED có độ khó cao:

	1. Bắt buộc đề cập, liên quan đến Title của context
	- Claim phải liên quan đến Title của context.
	- Câu claim phải liên quan đến nội dung của context.

	2. Mượn cấu trúc và n-gram đặc trưng của SUPPORTED để tạo sự đánh lừa
	- Sử dụng n-gram đặc trưng của SUPPORTED:
	- "một số", "so với", "đối với", "lãnh thổ", "thế kỷ"
	- "trong lịch sử", "của người dân", "khoảng thời gian"
	- Tạo câu có cấu trúc giống SUPPORTED nhưng thay đổi thông tin cốt lõi.
	- Ví dụ: "Trong lịch sử phát triển của [Title], một số yếu tố đã dẫn đến..." (dùng cụm từ SUPPORTED)

	3. Kết hợp với n-gram đặc trưng của NEI
	- Thêm "có thể", "phát triển", "công nghệ" vào câu REFUTED.
	- Dùng "sử dụng", "thực hiện", "ảnh hưởng" theo cách làm sai lệch thông tin.
	- Kết hợp "tổ chức", "thời gian", "khu vực" với dữ liệu sai từ context.

	4. Chiến lược tạo thông tin sai nhưng khó phát hiện
	- Thay đổi số liệu, đơn vị đo lường: "500 triệu" → "5 triệu".
	- Dùng dữ kiện đúng nhưng gán sai nguyên nhân hoặc hậu quả.
	- Lấy một phần thông tin đúng từ context nhưng thay đổi một chi tiết quan trọng.
	- Đảo ngược quan hệ nhân quả: "X gây ra Y" → "Y gây ra X".
	- Thay thế đối tượng hoặc thời gian: "Apple phát triển iPhone" → "Samsung phát triển iPhone".

	5. Tận dụng n-gram đặc trưng của REFUTED một cách tinh tế
	- Sử dụng "vào năm" với năm sai.
	- Dùng "chỉ có" với thông tin không chính xác.
	- Áp dụng "duy nhất", "đầu tiên" với thông tin sai lệch.
	- Tận dụng cụm từ "các quốc gia", "trên thế giới" để thêm thông tin sai lệch.

	6. Tránh lỗi phổ biến
	- Không tạo câu claim quá dễ đoán là REFUTED.
	- Không sử dụng phủ định đơn giản như "không đúng", "không thể".
	- Tránh n-gram quá điển hình của REFUTED như "không có bất kỳ", "việt nam là".
	"""
	                }
	            },
	            "required": ["claims"]
	        },
	        "NEI": {
	            "type": "object",
	            "properties": {
	                "claims": {
	                    "type": "array",
	                    "items": {
	                        "type": "string"
	                    },
	                    "description": """
	Hướng dẫn tạo claim NEI có độ khó cao:

	1. Bắt buộc đề cập, liên quan đến Title của context
	- Claim phải liên quan đến Title của context.
	- Câu claim phải liên quan đến nội dung của context.

	2. Mượn cấu trúc và n-gram của SUPPORTED và REFUTED để tạo sự đánh lừa
	- Sử dụng n-gram của SUPPORTED:
	- "một số", "so với", "đối với", "lãnh thổ", "thế kỷ"
	- "trong lịch sử", "của người dân", "khoảng thời gian"
	- Sử dụng n-gram của REFUTED:
	- "vào năm", "chỉ có", "duy nhất", "đầu tiên"
	- "trở thành một", "liên quan đến", "được ghi nhận"
	- Kết hợp cả hai nhóm n-gram vào câu NEI để làm tăng độ khó phân biệt.

	3. Tận dụng tối đa n-gram đặc trưng của NEI
	- Tập trung vào các cụm từ: "có thể", "phát triển", "sử dụng", "công nghệ"
	- Sử dụng "công ty", "tổ chức", "thực hiện", "ảnh hưởng" theo cách mở rộng thông tin
	- Áp dụng "thời gian", "khu vực", "thế giới" vào các dự đoán hoặc xu hướng

	4. Tạo suy luận hợp lý nhưng không có bằng chứng trong context
	- "Nếu [Title] tiếp tục phát triển với tốc độ hiện tại, nó có thể trở thành tiêu chuẩn công nghiệp vào năm 2030."
	- Sử dụng n-gram đặc trưng của NEI: "sự phát triển", "có khả năng", "trong thời gian"
	- Kết hợp với cụm từ đặc trưng của SUPPORTED và REFUTED để tạo độ khó cao

	5. Kết hợp thông minh các n-gram đặc trưng
	- "[Title] có thể được xem là một trong những yếu tố đầu tiên..." (kết hợp NEI + REFUTED)
	- "Trong lịch sử phát triển, [Title] có thể ảnh hưởng đến..." (kết hợp SUPPORTED + NEI)
	- "Vào năm [X], [Title] có thể đã phát triển..." (kết hợp REFUTED + NEI)

	6. Lưu ý
	- Nếu claim có ít nhất một phần thông tin chưa xác định, thì claim đó là NEI.
	- Không tạo claim quá mơ hồ hoặc không liên quan.
	- Không sử dụng thông tin sai, chỉ dùng thông tin chưa đủ để xác định.
	- Tránh lạm dụng n-gram quá điển hình của NEI như "nhà khoa học", "con đường tơ lụa".
	"""
	                }
	            },
	            "required": ["claims"]
	        }
	    }

	def _configure_api(self):
	    """Point the google.generativeai client at this generator's current key."""
	    active_key = self.current_key
	    genai.configure(api_key=active_key)

	def _rotate_api_key(self):
	    """Report the label whose key is in use; there is one key per type, so nothing rotates."""
	    message = "Using API key for {}".format(self.claim_type)
	    print(message)

	def generate_claims(self, context: str, title: str, prompt_variation: str = None, max_retries: int = 3) -> List[str]:
	    """Ask the model for claims of this generator's label about ``context``.

	    Args:
	        context: Passage the claims must be grounded in.
	        title: Title of the article the passage comes from.
	        prompt_variation: Task sentence containing a ``{0}`` placeholder for the
	            label; a random entry of ``self.prompt_variations`` is used when None.
	        max_retries: Maximum number of model calls before giving up.

	    Returns:
	        Claims not seen before in this session and whose leading words differ
	        from earlier ones; if every claim was filtered out, the first raw claim
	        is returned instead. An empty list when all retries failed.
	    """
	    # Add randomization to your prompts
	    variation_phrases = [
	        "Đừng lặp lại cấu trúc từ những câu trước đó.",
	        "Tạo câu với cấu trúc hoàn toàn khác biệt.",
	        "Sử dụng cách diễn đạt mới.",
	        "Đổi mới cách tiếp cận trong câu này.",
	        "Hãy thay đổi cấu trúc câu so với lần trước.",
	        "Sáng tạo với cách diễn đạt khác biệt.",
	        "Hãy tạo ra sự đa dạng về cấu trúc ngữ pháp."
	    ]

	    random_instruction = random.choice(variation_phrases)
	    diversity_instruction = random.choice(self.diversity_instructions)

	    # Use prompt variation if provided, otherwise pick one at random
	    if prompt_variation is None:
	        prompt_variation = random.choice(self.prompt_variations)
	    prompt_variation = prompt_variation.format(self.claim_type)

	    # First 8 characteristic n-grams of each *other* label, used as decoys.
	    decoy_ngrams = ', '.join(
	        pattern
	        for label in ['SUPPORTED', 'REFUTED', 'NEI']
	        if label != self.claim_type
	        for pattern in self.ngram_patterns[label][:8]
	    )

	    # The prompt does not change between retries, so it is built once here.
	    # Its last line names the "claims" field because that is what the response
	    # schema defines and what the parser below reads.
	    prompt = f"""
	Fact checking là 1 trong lĩnh vực xử lý ngôn ngữ tự nhiên (NLP), là bài toán kiểm tra và xác minh tính chính xác của 1 nhận định được đưa ra trên các thông tin, tin tức từ mạng xã hội, báo chí. Mô hình NLP sẽ xác định câu nhận định đó là chính xác hay sai sự thật hoặc không được đề cập đến trong thông tin đã được cung cấp trước đó (context).

	Tiêu đề: "{title}"
	Context: "{context}"

	Nhiệm vụ: {prompt_variation} Yêu cầu dài 15-60 từ, có suy luận hợp lý từ nội dung context.

	QUAN TRỌNG: Sử dụng các cụm từ đặc trưng của các nhãn khác để đánh lừa mô hình:
	{decoy_ngrams}

	{random_instruction}

	{diversity_instruction}

	Yêu cầu chi tiết:
	1. BẮT BUỘC sử dụng ít nhất 2-3 cụm từ từ danh sách n-gram được cung cấp.
	2. Kết hợp thông minh các cụm từ để tạo câu có ý nghĩa và khó dự đoán.
	3. Câu claim PHẢI liên quan đến tiêu đề và nội dung của context.
	4. TRÁNH copy nguyên văn từ context.
	5. Sử dụng cấu trúc câu phức tạp, kết hợp nhiều thông tin.
	6. Tránh lặp lại cấu trúc câu từ những lần tạo trước.

	Trả về kết quả theo đúng format JSON với trường "claims".
	"""
	    # Collapsing every whitespace run (newlines included) to one space.
	    prompt = re.sub(r'\s+', ' ', prompt).strip()

	    for attempt in range(1, max_retries + 1):
	        try:
	            # Adjust temperature and sampling parameters (re-drawn on every try)
	            temperature = 0.7 + (0.4 * random.random())  # Between 0.7 and 1.1
	            top_p = 0.85 + (0.15 * random.random())  # Between 0.85 and 1.0
	            top_k = 35 + int(10 * random.random())  # Between 35 and 45

	            model = genai.GenerativeModel(
	                'models/gemini-1.5-flash-latest',
	                generation_config={
	                    "response_mime_type": "application/json",
	                    "response_schema": self.SCHEMAS[self.claim_type],
	                    "temperature": temperature,
	                    "top_p": top_p,
	                    "top_k": top_k,
	                    "max_output_tokens": 512
	                }
	            )

	            response = model.generate_content(prompt)
	            time.sleep(3)  # Prevent rate limiting

	            claims_response = json.loads(response.text)["claims"]

	            # Filter out claims with similar patterns or already seen claims
	            filtered_claims = []
	            for claim in claims_response:
	                # Pattern = first four words (or the whole claim when shorter)
	                words = claim.split()
	                if len(words) > 3:
	                    pattern = ' '.join(words[:4]).lower()
	                else:
	                    pattern = claim.lower()

	                if pattern not in self.used_patterns and claim not in st.session_state.all_generated_claims:
	                    self.used_patterns.add(pattern)
	                    st.session_state.all_generated_claims.append(claim)
	                    filtered_claims.append(claim)

	            # If all claims were filtered, return at least one
	            if not filtered_claims and claims_response:
	                filtered_claims = [claims_response[0]]
	                if claims_response[0] not in st.session_state.all_generated_claims:
	                    st.session_state.all_generated_claims.append(claims_response[0])

	            return filtered_claims or claims_response

	        except Exception as e:
	            # Broad on purpose: API, JSON and schema errors are all reported to
	            # the UI and retried rather than aborting the whole run.
	            st.error(f"Error during claim generation: {str(e)}")
	            if attempt >= max_retries:
	                st.error(f"Max retries ({max_retries}) exceeded")
	            else:
	                time.sleep(2)  # Wait before retrying (pointless after the last try)

	    return []  # Return empty list if all retries failed

	def generate_hard_claims(self, context: str, title: str, predict_func, max_attempts: int = 3) -> Dict[str, Any]:
	    """Generate claims and keep the ones the classifier finds hard.

	    A claim counts as hard when the probability ``predict_func`` assigns to this
	    generator's own label is lower than the sum of the other two (for NEI it is
	    also enough to be less than 0.1 above either other label). Up to three
	    claims are collected; when too few qualify, the evaluated claims with the
	    lowest own-label probability are used as fallbacks.

	    Args:
	        context: Passage the claims are about.
	        title: Title of the source article.
	        predict_func: Callable ``(context, claim)`` returning a dict with
	            ``'verdict'`` and ``'probabilities'`` keyed by label.
	        max_attempts: Maximum number of generation rounds.

	    Returns:
	        Dict with ``'success'`` (at least two claims collected), ``'attempts'``,
	        ``'results'`` (list of per-claim entries) and ``'used_fallback'``
	        (True when any result was added by the fallback selection).
	    """
	    target_count = 3          # stop as soon as this many claims are collected
	    min_success_count = 2     # fewer than this means the run failed
	    relax_after_attempts = 3  # from this round on, an empty batch triggers fallback
	    other_labels = [label for label in ["SUPPORTED", "REFUTED", "NEI"] if label != self.claim_type]
	    probability_key = f'{self.claim_type}_probability'

	    results = []
	    all_attempts = []
	    evaluated_claims = set()
	    used_fallback = False
	    attempts = 0

	    def add_fallbacks(needed):
	        """Append up to ``needed`` unused attempts with the lowest own-label probability."""
	        chosen = {entry['claim'] for entry in results}
	        candidates = sorted(
	            (entry for entry in all_attempts if entry['claim'] not in chosen),
	            key=lambda entry: entry[probability_key],
	        )
	        picked = candidates[:needed]
	        results.extend(picked)
	        return bool(picked)

	    # One generation round per iteration
	    while len(results) < target_count and attempts < max_attempts:
	        attempts += 1

	        # Select a different prompt variation for each attempt
	        prompt_variation = self.prompt_variations[attempts % len(self.prompt_variations)]

	        # Generate claims once per attempt
	        claims = self.generate_claims(context, title, prompt_variation=prompt_variation)
	        if not claims:  # nothing was generated this round
	            continue

	        successful_claims_in_batch = 0

	        for claim in claims:
	            # Skip claims that were already scored: re-predicting them wastes a
	            # model call and would let the fallback add the same claim twice.
	            if claim in evaluated_claims:
	                continue
	            evaluated_claims.add(claim)

	            prediction = predict_func(context, claim)
	            current_prob = prediction['probabilities'][self.claim_type]
	            other_probs = [prediction['probabilities'][label] for label in other_labels]
	            sum_other_probs = sum(other_probs)

	            print(f"Attempt {attempts}, Claim: {claim}")
	            print(f"Current label probability: {current_prob:.2%}")
	            print(f"Other labels probabilities: {other_probs[0]:.2%}, {other_probs[1]:.2%}\n")

	            # Entry describing this evaluated claim
	            attempt_entry = {
	                'claim': claim,
	                'predicted_label': prediction['verdict'],
	                probability_key: current_prob,
	                'other_probabilities': dict(zip(other_labels, other_probs)),
	                'all_probabilities': prediction['probabilities']
	            }
	            all_attempts.append(attempt_entry)

	            # Validation logic depends on the claim type
	            if self.claim_type == "NEI":
	                success = (current_prob < sum_other_probs) or any(
	                    current_prob < (other_prob + 0.1) for other_prob in other_probs
	                )
	            else:
	                success = current_prob < sum_other_probs

	            if success:
	                results.append(attempt_entry)
	                successful_claims_in_batch += 1

	                # Stop early once the target number of results is reached
	                if len(results) == target_count:
	                    break

	        # If this batch produced nothing usable after several rounds, relax the
	        # criteria and take the best of what has been evaluated so far.
	        if (
	            successful_claims_in_batch == 0
	            and attempts >= relax_after_attempts
	            and len(results) < min_success_count
	        ):
	            if add_fallbacks(min_success_count - len(results)):
	                used_fallback = True

	    # Still short of the target: top up with the lowest-probability attempts
	    if len(results) < target_count and add_fallbacks(target_count - len(results)):
	        used_fallback = True

	    return {
	        'success': len(results) >= min_success_count,
	        'attempts': attempts,
	        'results': results,
	        'used_fallback': used_fallback
	    }

	def process_dataframe_with_claims(df: pd.DataFrame, selected_claim_types: List[str]) -> pd.DataFrame:
	    """Generate claims for every row of ``df`` and return one output row per claim.

	    Args:
	        df: Articles with at least the 'Summary', 'Title' and 'ID' columns.
	        selected_claim_types: Labels to generate claims for.

	    Returns:
	        A new DataFrame in which each input row is repeated once per accepted
	        claim, extended with 'claim', 'label', 'predicted_label', 'probability',
	        'all_probabilities' (JSON string) and 'used_fallback'. Rows whose
	        generation did not succeed are omitted.
	    """
	    all_rows = []
	    progress_bar = st.progress(0)
	    total_rows = len(df)

	    # Progress is driven by row position, not by the index label: the label is
	    # only a valid fraction numerator when the index happens to be 0..n-1.
	    for position, (_, row) in enumerate(df.iterrows(), start=1):
	        context = row['Summary']
	        title = row['Title']

	        for claim_type in selected_claim_types:
	            # A fresh generator per use: its constructor configures the global
	            # client with the key belonging to this claim type.
	            generator = ClaimGenerator(claim_type)
	            result = generator.generate_hard_claims(context, title, predict)

	            if result['success']:
	                for claim_result in result['results']:
	                    new_row = row.copy()
	                    new_row['claim'] = claim_result['claim']
	                    new_row['label'] = claim_type
	                    new_row['predicted_label'] = claim_result['predicted_label']
	                    new_row['probability'] = claim_result[f'{claim_type}_probability']
	                    new_row['all_probabilities'] = json.dumps(claim_result['all_probabilities'])
	                    new_row['used_fallback'] = result.get('used_fallback', False)
	                    all_rows.append(new_row)

	                if result.get('used_fallback'):
	                    st.warning(f"Used fallback claims for article {row['ID']} {claim_type}")

	        progress_bar.progress(position / total_rows)

	    return pd.DataFrame(all_rows)

	def generate_claims_for_context(
	    context: str,
	    claim_types: List[str],
	    predict_func,
	    progress_bar,
	    title: str = ""
	) -> List[Dict[str, Any]]:
	    """Generate claims of each requested type for a single context.

	    Args:
	        context: Passage to generate claims about.
	        claim_types: Labels to generate claims for.
	        predict_func: Classifier callable forwarded to ``generate_hard_claims``.
	        progress_bar: Object with a ``progress(value, text)`` method.
	        title: Title of the passage, forwarded to the generator. Optional so
	            existing four-argument callers keep working.

	    Returns:
	        One dict per accepted claim with the context, claim, label, predicted
	        label, own-label probability and the full probability mapping.
	    """
	    claims_data = []
	    total_steps = len(claim_types)

	    for i, claim_type in enumerate(claim_types):
	        progress_bar.progress((i / total_steps), f"Generating {claim_type} claims...")

	        generator = ClaimGenerator(claim_type)
	        # generate_hard_claims takes (context, title, predict_func); the title
	        # must be passed so predict_func lands in the right parameter.
	        result = generator.generate_hard_claims(context, title, predict_func)

	        if not result['success']:
	            continue

	        for claim_result in result['results']:
	            claims_data.append({
	                'context': context,
	                'claim': claim_result['claim'],
	                'label': claim_type,
	                'predicted_label': claim_result['predicted_label'],
	                f'{claim_type}_probability': claim_result[f'{claim_type}_probability'],
	                'probabilities': claim_result['all_probabilities']
	            })

	    return claims_data

	# Wikipedia helper functions
	def clean_wikipedia_content(content):
	    """Strip wiki boilerplate from article text.

	    Removes the trailing "Xem thêm / Tham khảo / Liên kết ngoài / Chú thích /
	    Ghi chú" sections together with everything after them, remaining section
	    headers, numeric citation markers such as ``[12]``, and redundant
	    whitespace.

	    Args:
	        content: Raw article text.

	    Returns:
	        The cleaned text with non-empty lines separated by one blank line.
	    """
	    # Cut the trailing sections FIRST: once the generic header removal below has
	    # run, their "== ... ==" markers no longer exist and could never match.
	    trailing_sections = r'^==\s*(?:Xem thêm|Tham khảo|Liên kết ngoài|Chú thích|Ghi chú)\s*==.*'
	    content = re.sub(trailing_sections, '', content, flags=re.DOTALL | re.MULTILINE)

	    # Remove the remaining section headers
	    content = re.sub(r'==+.*?==+', '', content)

	    # Remove citation references [1], [2], etc.
	    content = re.sub(r'\[\d+\]', '', content)

	    # Replace multiple newlines with single newline
	    content = re.sub(r'\n\s*\n', '\n', content)

	    # Clean extra whitespace
	    content = re.sub(r' +', ' ', content)
	    stripped_lines = (line.strip() for line in content.split("\n"))
	    return "\n\n".join(line for line in stripped_lines if line)

	def split_vietnamese_paragraphs(text):
	    """Return the non-empty, stripped paragraphs of *text* (separated by blank lines)."""
	    trimmed = (piece.strip() for piece in re.split(r'\n\s*\n', text))
	    return [piece for piece in trimmed if piece]

	def split_sentences_vietnamese(text):
	    """Split *text* into sentences at ., ! or ? followed by whitespace and a capital.

	    Dots belonging to common Vietnamese abbreviations (at the start of a word)
	    and to ellipses are protected so they do not end a sentence.

	    Args:
	        text: Text of one paragraph.

	    Returns:
	        List of stripped, non-empty sentences.
	    """
	    # Common Vietnamese abbreviations ('T.P.' is listed so its final dot is protected too)
	    abbreviations = ['TS.', 'PGS.', 'GS.', 'ThS.', 'BS.', 'TP.', 'Tp.', 'T.P.', 'T.P', 'Q.', 'P.']

	    # A private-use code point stands in for protected dots. Unlike '@', it does
	    # not occur in ordinary text, so restoring it cannot rewrite real characters
	    # (with '@', "ABCP@mail.com" came back as "ABCP.mail.com").
	    protected_dot = '\uE000'

	    # Longest alternatives first so 'PGS.' wins over 'GS.'; the look-behind limits
	    # matches to the start of a word, so "IRAQ." is not mistaken for 'Q.'.
	    alternatives = '|'.join(
	        re.escape(abbr) for abbr in sorted(abbreviations, key=len, reverse=True)
	    )
	    abbreviation_re = re.compile(r'(?<!\w)(?:' + alternatives + r')')
	    text = abbreviation_re.sub(lambda m: m.group(0).replace('.', protected_dot), text)

	    # Protect ellipses
	    text = text.replace("...", protected_dot * 3)

	    # Split on sentence boundaries
	    pattern = r'(?<=[.!?])\s+(?=[A-ZÀÁẢÃẠĂẰẮẲẴẶÂẦẤẨẪẬĐÈÉẺẼẸÊỀẾỂỄỆÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴ])'
	    sentences = re.split(pattern, text)

	    # Restore protected dots and clean sentences
	    restored = (s.replace(protected_dot, '.').strip() for s in sentences)
	    return [s for s in restored if s]

	def merge_short_sentences(sentences, min_length=30):
	    """Glue each sentence shorter than *min_length* characters onto the one after it.

	    The running piece keeps absorbing following sentences until it reaches
	    *min_length*; the last piece is emitted as-is, even when still short.
	    """
	    if not sentences:
	        return []

	    merged = []
	    buffer = sentences[0]

	    for following in sentences[1:]:
	        if len(buffer) >= min_length:
	            # Long enough: emit it and start a new piece.
	            merged.append(buffer)
	            buffer = following
	        else:
	            buffer = buffer + ' ' + following

	    if buffer:
	        merged.append(buffer)

	    return merged

	def chunk_content(text, max_length=700, min_length=210):
	    """Group the sentences of *text* into space-joined chunks.

	    A chunk is closed when adding the next sentence would push it past
	    *max_length* and it already holds at least *min_length* characters;
	    whatever is left at the end becomes the final chunk.
	    """
	    pieces = []
	    pending = []
	    pending_len = 0

	    for block in split_vietnamese_paragraphs(text):
	        for unit in merge_short_sentences(split_sentences_vietnamese(block)):
	            would_overflow = pending_len + len(unit) > max_length
	            if would_overflow and pending_len >= min_length:
	                # Close the running chunk and start the next one with this sentence.
	                pieces.append(' '.join(pending))
	                pending = [unit]
	                pending_len = len(unit)
	                continue

	            pending.append(unit)
	            pending_len += len(unit) + 1  # +1 for the joining space

	    # Flush whatever is left
	    if pending:
	        pieces.append(' '.join(pending))

	    return pieces

	def chars_to_nums(s):
	    """Encode the letters of *s* found in ``char_to_num`` as digits 1-4; other characters are dropped."""
	    digits = []
	    for char in s:
	        key = char.upper()
	        if key in char_to_num:
	            digits.append(str((char_to_num[key] % 4) + 1))
	    return ''.join(digits)

	def create_id(topic, stt):
	    """Build an article id of the form ``uit_<digits>_<stt>``.

	    The topic is lower-cased and stripped of diacritics, abbreviated to the
	    first character of each whitespace-separated word, and every character of
	    that abbreviation is encoded as a digit 1-4.

	    Args:
	        topic: Topic name (may contain Vietnamese diacritics).
	        stt: Sequence number appended to the id.

	    Returns:
	        The id string.
	    """
	    import unicodedata  # local import: used only for accent stripping

	    # Remove accents and convert to lowercase
	    def remove_accents(s):
	        # 'đ' has no Unicode decomposition, so it is mapped by hand.
	        s = s.lower().replace('đ', 'd')
	        # NFD splits every accented letter into base letter + combining marks;
	        # dropping the marks also covers the toneless ă/â/ê/ô/ơ/ư and input
	        # that arrives already decomposed.
	        decomposed = unicodedata.normalize('NFD', s)
	        return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

	    # Generate numeric representation
	    def chars_to_nums(s):
	        letter_values = {
	            letter: value
	            for value, letter in enumerate('abcdefghijklmnopqrstuvwxyz', start=1)
	        }
	        # Characters outside a-z count as 0 and therefore encode as "1".
	        return ''.join(str((letter_values.get(char, 0) % 4) + 1) for char in s)

	    # Clean and abbreviate topic
	    topic_clean = remove_accents(topic)
	    topic_abbr = ''.join(word[0] for word in topic_clean.split())

	    # Generate unique ID
	    topic_num = chars_to_nums(topic_abbr)
	    return f'uit_{topic_num}_{stt}'

	def wikipedia_scrape(title_input, stt, filename):
	    """Fetch the summary of a Wikipedia page and return it as chunk records.

	    Returns a list of dicts with the keys "ID", "Title", "Topic", "Summary" and
	    "URL" (one per chunk of at least 210 characters), or None when the lookup
	    raises DisambiguationError or PageError.
	    """
	    try:
	        page = wikipedia.page(title_input)
	        title = page.title
	        content = wikipedia.summary(title_input, sentences=15)
	        url = page.url
	        topic = filename.split(".")[0]
	        base_article_id = create_id(topic, stt)

	        # Clean the summary, then cut it into appropriately sized parts
	        chunks = chunk_content(clean_wikipedia_content(content))

	        # Chunk numbers start at 1 and count every chunk, including skipped short ones.
	        return [
	            {
	                "ID": f"{base_article_id}_{number}",
	                "Title": title,
	                "Topic": topic,
	                "Summary": chunk_text,
	                "URL": url
	            }
	            for number, chunk_text in enumerate(chunks, 1)
	            if len(chunk_text) >= 210  # Only include chunks that meet minimum length
	        ]
	    except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError) as e:
	        print(f"Error processing {title_input}: {e}")
	        return None

	@st.cache_data
	def convert_df_to_csv(df):
	    """Serialize *df* to CSV without the index and return it as UTF-8 bytes with a BOM."""
	    csv_text = df.to_csv(index=False)
	    return csv_text.encode("utf-8-sig")

	def main():
	    """Streamlit entry point: upload a title list, scrape Wikipedia, generate claims.

	    The uploaded text file holds one article title per line. Each title is
	    scraped into chunk records, shown as a table, and can be downloaded as CSV
	    with or without generated claims.
	    """
	    st.title("Ứng dụng crawl data wiki tiếng việt và tạo claims")

	    uploaded_file = st.file_uploader("Tải lên tệp văn bản (txt)", type=["txt"])
	    if uploaded_file is None:
	        return

	    file_progress = st.progress(0)
	    status_text = st.empty()

	    # "utf-8-sig" drops a leading BOM if present; with plain "utf-8" the BOM
	    # stays glued to the first title (str.strip() does not remove it).
	    file_contents = uploaded_file.getvalue().decode("utf-8-sig")
	    # splitlines() does not produce a phantom empty title after a final newline.
	    titles = file_contents.splitlines()
	    total_titles = len(titles)
	    base_name = uploaded_file.name.split(".")[0]

	    articles_info = []
	    for index, title in enumerate(titles):
	        title = title.strip()
	        if title:
	            status_text.text(f"Processing article {index + 1}/{total_titles}: {title}")
	            article_infos = wikipedia_scrape(title, index + 1, uploaded_file.name)

	            # wikipedia_scrape returns None on lookup errors and may return an
	            # empty list; report both instead of skipping the title silently.
	            if article_infos:
	                articles_info.extend(article_infos)
	            else:
	                st.warning(f"Dòng {index + 1}: Không thu thập được nội dung cho '{title}'")
	        else:
	            st.warning(f"Dòng {index + 1}: Tiêu đề trống")

	        # Advance for every line so the bar always reaches 100%.
	        file_progress.progress((index + 1) / total_titles)

	    file_progress.empty()
	    status_text.empty()

	    if not articles_info:
	        st.error("Không có bài viết nào được thu thập từ Wikipedia.")
	        return

	    df = pd.DataFrame(articles_info)

	    st.subheader("Danh sách bài viết từ Wikipedia:")
	    st.dataframe(df, use_container_width=True)

	    # Add claim type selection
	    claim_types = ["SUPPORTED", "REFUTED", "NEI"]
	    selected_claim_types = st.multiselect(
	        "Chọn loại claim cần tạo:",
	        claim_types,
	        default=claim_types
	    )

	    # Add automatic claim generation for all summaries
	    if st.button("Tạo Claims cho tất cả bài viết"):
	        if not selected_claim_types:
	            st.error("Vui lòng chọn ít nhất một loại claim để tạo.")
	        else:
	            with st.spinner("Đang tạo claims cho tất cả bài viết..."):
	                # Process the DataFrame and generate claims
	                df_with_claims = process_dataframe_with_claims(df, selected_claim_types)

	                st.subheader("Bảng dữ liệu với Claims:")
	                st.dataframe(df_with_claims, use_container_width=True)

	                # Download options
	                csv = convert_df_to_csv(df_with_claims)
	                claims_filename = f"uit_{base_name}_with_claims.csv"
	                st.download_button(
	                    label="Download CSV với Claims",
	                    data=csv,
	                    file_name=claims_filename,
	                    mime="text/csv",
	                )

	    # Original download option for basic DataFrame
	    csv = convert_df_to_csv(df)
	    csv_filename = "uit_" + base_name + ".csv"
	    st.download_button(
	        label="Download CSV (không có claims)",
	        data=csv,
	        file_name=csv_filename,
	        mime="text/csv",
	    )

	# Run the Streamlit app only when this file is executed as the main script.
	if __name__ == "__main__":
	main()