Spaces:

tiya1012
/

pdpatest2

Sleeping

App Files Files Community

pdpatest2 / app.py

tiya1012

Update app.py

62b6f90 verified 4 months ago

raw

history blame contribute delete

13.9 kB

	import os
	import re
	from io import BytesIO
	from typing import List, Optional, Tuple

	import gradio as gr  # Import gradio
	import numpy as np
	import PyPDF2
	import requests
	from openai import OpenAI
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	# Initialize the client.
	# The API key is read from the TYPHOON_API_KEY environment variable (e.g. a
	# Space secret) so that the secret never lives in source control. The key
	# that used to be hard-coded here has been published and should be revoked.
	# NOTE(review): when the variable is unset, api_key is None and the OpenAI
	# client is expected to fall back to OPENAI_API_KEY or raise - confirm.
	client = OpenAI(
	    api_key=os.environ.get("TYPHOON_API_KEY"),
	    base_url="https://api.opentyphoon.ai/v1",
	)

	class PDPAKnowledgeBase:
	    """TF-IDF retrieval index built from a single PDF document.

	    On construction the PDF at ``pdf_url`` is downloaded, its text is
	    extracted page by page, split into overlapping character chunks and
	    vectorized with TF-IDF. Every stage is best-effort: a failure is
	    printed and leaves ``chunks`` empty and/or ``chunk_vectors`` as
	    ``None``, in which case ``search_relevant_chunks`` returns no results.
	    """

	    # How far back from a chunk's nominal end to look for a sentence ending.
	    SENTENCE_LOOKBACK = 200
	    # Characters treated as sentence terminators when choosing a break point.
	    SENTENCE_ENDINGS = ".!?"

	    def __init__(self, pdf_url: str):
	        """Store the URL and immediately download and index the PDF."""
	        self.pdf_url = pdf_url
	        self.chunks: List[str] = []
	        # max_features might need adjustment based on PDF content, but 1000
	        # is a reasonable start.
	        # NOTE(review): stop_words='english' and the default word tokenizer
	        # are used regardless of the PDF's language; if the document is
	        # mostly Thai, a character n-gram analyzer may work better - confirm.
	        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
	        self.chunk_vectors = None
	        self.load_and_process_pdf()

	    def download_pdf(self) -> Optional[bytes]:
	        """Download the PDF and return its raw bytes, or ``None`` on failure."""
	        print("📥 กำลังดาวน์โหลด PDPA PDF...")
	        try:
	            response = requests.get(self.pdf_url, timeout=30)
	            response.raise_for_status()
	        except Exception as e:
	            # Deliberately broad: any failure here just disables retrieval.
	            print(f"❌ ไม่สามารถดาวน์โหลด PDF ได้: {e}")
	            return None
	        print("✅ ดาวน์โหลดสำเร็จ!")
	        return response.content

	    def extract_text_from_pdf(self, pdf_content: bytes) -> str:
	        """Extract the text of every readable page of the PDF.

	        Each page's text is preceded by a page-number separator line. Pages
	        that fail to extract are reported and skipped. Returns ``""`` when
	        the PDF cannot be opened at all.
	        """
	        print("📄 กำลังแยกข้อความจาก PDF...")
	        try:
	            pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_content))

	            # Collect the pieces and join once instead of repeated str +=.
	            parts = []
	            for page_num, page in enumerate(pdf_reader.pages, start=1):
	                try:
	                    page_text = page.extract_text()
	                except Exception as e:
	                    print(f"⚠️ ไม่สามารถอ่านหน้า {page_num}: {e}")
	                    continue
	                # Page separator gives context when chunks span pages.
	                parts.append(f"\n--- หน้า {page_num} ---\n{page_text}\n")

	            print(f"✅ แยกข้อความสำเร็จ! จำนวน {len(pdf_reader.pages)} หน้า")
	            return "".join(parts)
	        except Exception as e:
	            print(f"❌ ไม่สามารถแยกข้อความได้: {e}")
	            return ""

	    def chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
	        """Split text into overlapping chunks of about ``chunk_size`` characters.

	        Whitespace runs are collapsed to single spaces first. A chunk that
	        does not reach the end of the text is cut just after the last
	        sentence terminator found within ``SENTENCE_LOOKBACK`` characters of
	        its nominal end, when there is one.

	        Args:
	            text: The text to split.
	            chunk_size: Nominal chunk length in characters; must be positive.
	            overlap: Characters shared between consecutive chunks. Values
	                outside ``0 .. chunk_size - 1`` are clamped into that range.

	        Returns:
	            The non-empty chunks, in document order.

	        Raises:
	            ValueError: If ``chunk_size`` is not positive.
	        """
	        print("✂️ กำลังแบ่งข้อความเป็นส่วนๆ...")

	        if chunk_size <= 0:
	            raise ValueError("chunk_size must be a positive integer")
	        # An overlap as large as the chunk would stop the window advancing.
	        overlap = max(0, min(overlap, chunk_size - 1))

	        # Clean text
	        text = re.sub(r'\s+', ' ', text.strip())
	        text_len = len(text)

	        chunks = []
	        start = 0
	        while start < text_len:
	            end = start + chunk_size

	            # Try to break at a sentence end, scanning backwards.
	            if end < text_len:
	                floor = max(end - self.SENTENCE_LOOKBACK, start)
	                for i in range(end, floor, -1):
	                    if text[i] in self.SENTENCE_ENDINGS:
	                        end = i + 1
	                        break

	            chunk = text[start:end].strip()
	            if chunk:
	                chunks.append(chunk)

	            # The chunk just emitted reached the end of the text; another
	            # pass would only repeat a tail already contained in it.
	            if end >= text_len:
	                break

	            # Always move forward by at least one character so the loop
	            # terminates even when a sentence break lands near ``start``.
	            start = max(end - overlap, start + 1)

	        print(f"✅ แบ่งเป็น {len(chunks)} ส่วน")
	        return chunks

	    def create_embeddings(self, chunks: List[str]):
	        """Fit the TF-IDF vectorizer on ``chunks`` and store the matrix.

	        On failure the error is printed and ``chunk_vectors`` keeps its
	        previous value.
	        """
	        print("🔢 กำลังสร้าง embeddings...")
	        try:
	            self.chunk_vectors = self.vectorizer.fit_transform(chunks)
	            print("✅ สร้าง embeddings สำเร็จ!")
	        except Exception as e:
	            print(f"❌ ไม่สามารถสร้าง embeddings ได้: {e}")

	    def load_and_process_pdf(self):
	        """Run the pipeline: download, extract text, chunk, vectorize.

	        Stops silently at the first stage that yields nothing.
	        """
	        pdf_content = self.download_pdf()
	        if not pdf_content:
	            return

	        text = self.extract_text_from_pdf(pdf_content)
	        if not text:
	            return

	        self.chunks = self.chunk_text(text)
	        if self.chunks:
	            self.create_embeddings(self.chunks)

	    def search_relevant_chunks(
	        self,
	        query: str,
	        top_k: int = 3,
	        *,
	        min_similarity: float = 0.1,
	    ) -> List[Tuple[str, float]]:
	        """Return the chunks most similar to ``query``.

	        Args:
	            query: Free-text search query.
	            top_k: Maximum number of chunks to return.
	            min_similarity: Cosine-similarity score a chunk must exceed.

	        Returns:
	            ``(chunk, score)`` pairs ordered from most to least similar;
	            empty when nothing is indexed, the query has no usable tokens,
	            no chunk passes the threshold, or an error occurs.
	        """
	        if not self.chunks or self.chunk_vectors is None or top_k <= 0:
	            return []

	        try:
	            # The analyzer applies the same lowercasing, tokenization and
	            # stop-word removal as transform(), so an empty result means
	            # the query carries no usable terms.
	            if not self.vectorizer.build_analyzer()(query):
	                return []

	            # Vectorize the query
	            query_vector = self.vectorizer.transform([query])

	            # Calculate similarities
	            similarities = cosine_similarity(query_vector, self.chunk_vectors)[0]

	            # Get top-k most similar chunks, best first
	            top_indices = np.argsort(similarities)[::-1][:top_k]

	            return [
	                (self.chunks[idx], float(similarities[idx]))
	                for idx in top_indices
	                if similarities[idx] > min_similarity
	            ]
	        except Exception as e:
	            print(f"⚠️ ข้อผิดพลาดในการค้นหา: {e}")
	            return []

	# --- Gradio Integration ---

	# Module-level knowledge base, built once at import time, plus a flag that
	# records whether retrieval-augmented answers are available.
	pdf_url = "https://raw.githubusercontent.com/tiya1012/pdpa/main/PDPA_Guideline_v_1_merged.pdf"
	kb = PDPAKnowledgeBase(pdf_url)

	# Retrieval is usable only when the PDF produced at least one chunk.
	use_rag = bool(kb.chunks)
	if use_rag:
	    print(f"✅ โหลดข้อมูล PDPA สำเร็จ! มีข้อมูล {len(kb.chunks)} ส่วน")
	else:
	    print("❌ ไม่สามารถโหลดข้อมูล PDPA ได้ กำลังใช้โหมดปกติ...")

	def _history_to_messages(history):
	    """Convert Gradio chat history into OpenAI-style message dicts.

	    Accepts both history layouts: a list of ``(user, bot)`` pairs, and a
	    list of ``{"role": ..., "content": ...}`` dicts. Entries whose content
	    is not plain text are dropped; a pair is dropped as a whole so the
	    user/assistant alternation is preserved.
	    """
	    converted = []
	    for turn in history or []:
	        if isinstance(turn, dict):
	            role = turn.get("role")
	            content = turn.get("content")
	            if role in ("user", "assistant") and isinstance(content, str):
	                converted.append({"role": role, "content": content})
	            continue

	        human_msg, ai_msg = turn
	        if isinstance(human_msg, str) and isinstance(ai_msg, str):
	            converted.append({"role": "user", "content": human_msg})
	            converted.append({"role": "assistant", "content": ai_msg})
	    return converted


	def predict_chat(message: str, history: List[Tuple[str, str]]) -> str:
	    """
	    Function to handle chat predictions for Gradio interface.

	    Builds the conversation (system prompt, prior turns, current message),
	    appends up to three reference excerpts from the knowledge base when
	    retrieval is enabled, and asks the chat model for a complete reply.

	    Args:
	        message (str): The user's current input message.
	        history (List[Tuple[str, str]]): List of previous conversation turns.
	            Each turn is a tuple of (user_message, bot_message); a list of
	            role/content dicts is accepted as well.
	    Returns:
	        str: The chatbot's response, or an error message if the API call fails.
	    """
	    # Initialize messages for the current turn, including system prompt
	    messages = [
	        {"role": "system", "content": "คุณเป็นผู้เชี่ยวชาญด้าน PDPA ให้คำแนะนำเกี่ยวกับกฎหมาย PDPA โดยตอบเป็นภาษาไทยเท่านั้น หากมีข้อมูลอ้างอิงจากเอกสาร PDPA ให้อ้างอิงแหล่งที่มาด้วย"}
	    ]

	    # Add previous conversation history in OpenAI's format
	    messages.extend(_history_to_messages(history))

	    # Search for relevant information if RAG is available
	    context = ""
	    if use_rag:
	        relevant_chunks = kb.search_relevant_chunks(message)
	        if relevant_chunks:
	            # Each excerpt is limited to its first 500 characters.
	            references = [
	                f"\n[อ้างอิง {i}] {chunk[:500]}...\n"
	                for i, (chunk, _score) in enumerate(relevant_chunks, 1)
	            ]
	            context = "\n\nข้อมูลอ้างอิงจากเอกสาร PDPA:\n" + "".join(references)

	    # Create enhanced prompt with context
	    enhanced_input = message
	    if context:
	        enhanced_input = f"{message}\n{context}\n\nโปรดตอบโดยอ้างอิงจากข้อมูลข้างต้นหากเกี่ยวข้อง และระบุแหล่งอ้างอิงด้วย"

	    # Add current user message to conversation for the LLM
	    messages.append({"role": "user", "content": enhanced_input})

	    try:
	        # Non-streaming call: the full response is returned at once.
	        completion = client.chat.completions.create(
	            model="typhoon-v2.1-12b-instruct",
	            messages=messages,
	            temperature=0.7,
	            max_tokens=2048,
	            top_p=0.9,
	            stream=False,
	        )
	        # Guard against a missing content field so a str is always returned.
	        return completion.choices[0].message.content or ""
	    except Exception as e:
	        # Top-level UI boundary: report the failure in the chat itself.
	        return f"❌ เกิดข้อผิดพลาด: {e}. กรุณาลองใหม่อีกครั้ง."


	# Script entry point: build the Gradio chat UI around predict_chat and launch it.
	if __name__ == "__main__":
	print("\n🚀 กำลังเริ่มต้น Gradio Chatbot...")

	# Custom CSS for a nicer look within Gradio
	# NOTE(review): the selectors below (.chat-message, .gr-button, .gr-textarea,
	# .gr-samples-container, .panel-header) are assumed to match the class names
	# emitted by the installed Gradio version - confirm.
	custom_css = """
	body {
	background-color: #f4f7f6; /* Light grey background */
	font-family: 'IBM Plex Sans Thai', 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
	}
	.gradio-container {
	max-width: 900px;
	margin: 40px auto;
	box-shadow: 0 4px 15px rgba(0,0,0,0.1);
	border-radius: 15px;
	overflow: hidden;
	}
	.panel-header {
	background-color: #28a745; /* Green header */
	color: white;
	padding: 18px 25px;
	text-align: center;
	font-size: 1.6em;
	font-weight: bold;
	border-top-left-radius: 15px;
	border-top-right-radius: 15px;
	}
	.chat-message.bot {
	background-color: #e6ffed; /* Light green for bot messages */
	border-left: 5px solid #28a745;
	padding: 12px 18px;
	border-radius: 10px;
	margin-bottom: 10px;
	margin-right: 15%; /* Keep some space on the right */
	word-wrap: break-word; /* Ensure long words break */
	}
	.chat-message.user {
	background-color: #e0f2f7; /* Light blue for user messages */
	border-right: 5px solid #007bff;
	padding: 12px 18px;
	border-radius: 10px;
	margin-bottom: 10px;
	margin-left: 15%; /* Keep some space on the left */
	word-wrap: break-word; /* Ensure long words break */
	}
	.gr-button {
	background-color: #007bff; /* Blue buttons */
	color: white;
	border-radius: 8px;
	padding: 10px 18px;
	font-size: 1em;
	border: none;
	transition: background-color 0.3s ease;
	}
	.gr-button:hover {
	background-color: #0056b3;
	}
	.gr-textarea {
	border-radius: 8px;
	border: 1px solid #ced4da;
	}
	.gradio-container h1 {
	padding: 15px 0;
	margin-bottom: 0;
	}
	/* Style for the chatbot examples */
	.gr-samples-container button {
	background-color: #f8f9fa;
	color: #495057;
	border: 1px solid #dee2e6;
	border-radius: 5px;
	padding: 8px 12px;
	margin: 5px;
	cursor: pointer;
	transition: all 0.2s ease-in-out;
	}
	.gr-samples-container button:hover {
	background-color: #e2e6ea;
	border-color: #dae0e5;
	color: #212529;
	}
	"""

	# Gradio ChatInterface setup
	# The description reuses the text of the system prompt, and the examples are
	# sample PDPA questions offered to the user.
	demo = gr.ChatInterface(
	fn=predict_chat,
	title="🤖 Typhoon PDPA RAG Expert Chatbot",
	description="คุณเป็นผู้เชี่ยวชาญด้าน PDPA ให้คำแนะนำเกี่ยวกับกฎหมาย PDPA โดยตอบเป็นภาษาไทยเท่านั้น หากมีข้อมูลอ้างอิงจากเอกสาร PDPA ให้อ้างอิงแหล่งที่มาด้วย",
	examples=[
	"PDPA คืออะไร?",
	"สิทธิของเจ้าของข้อมูลส่วนบุคคลมีอะไรบ้าง?",
	"ใครคือผู้ควบคุมข้อมูลส่วนบุคคล?",
	"กรณีใดบ้างที่สามารถเก็บรวบรวมข้อมูลส่วนบุคคลได้โดยไม่ต้องขอความยินยอม?",
	"การแจ้งวัตถุประสงค์ในการเก็บข้อมูลสำคัญอย่างไร?"
	],
	chatbot=gr.Chatbot(height=500), # Adjust chatbot display height
	theme=gr.themes.Soft(), # Use a soft theme, combined with custom CSS
	css=custom_css, # Apply custom CSS
	# Removed these arguments as they are not directly supported by gr.ChatInterface init
	# clear_btn="ล้างการสนทนา",
	# undo_btn="เลิกทำ",
	# retry_btn="ลองอีกครั้ง"
	)

	# Launch the Gradio interface
	# When this script runs, Gradio prints a URL to the console (e.g. http://127.0.0.1:7860)
	# That URL is what you need to use in the iframe in your HTML file
	# NOTE(review): share=True asks Gradio for a public share link; confirm it is
	# wanted (and supported) in the environment where this app is hosted.
	demo.launch(share=True)