tiya1012 committed on
Commit
2db260a
·
verified ·
0 Parent(s):

initial commit

Browse files
Files changed (4) hide show
  1. .gitattributes +35 -0
  2. README.md +14 -0
  3. app.py +116 -0
  4. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Pdpatest
3
+ emoji: 🌍
4
+ colorFrom: indigo
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 5.44.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ short_description: pdpa chatbot in Thai
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is a Gradio app that integrates a PDPA Knowledge Base and a chatbot using OpenAI's API.
2
+ import gradio as gr
3
+ import numpy as np
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ import re
7
+ from typing import List, Tuple
8
+ import requests
9
+ import PyPDF2
10
+ from io import BytesIO
11
+ from openai import OpenAI
12
+
13
# Initialize the OpenAI-compatible client for the OpenTyphoon API.
# SECURITY: the original commit hard-coded a live API key here (and the
# inline comment itself warned against it). That key is published in the
# repo history and must be revoked. Read the key from the environment
# (e.g. a Hugging Face Space secret) instead of embedding it in source.
import os

client = OpenAI(
    api_key=os.environ.get("TYPHOON_API_KEY", ""),  # set TYPHOON_API_KEY in Space secrets
    base_url="https://api.opentyphoon.ai/v1",
)
18
+
19
+ class PDPAKnowledgeBase:
20
def __init__(self, pdf_url: str):
    """Build the knowledge base for the PDF at *pdf_url*.

    Downloads the document, splits it into chunks, and fits TF-IDF
    vectors over those chunks so the instance is query-ready on return.
    """
    self.pdf_url = pdf_url
    # Populated by load_and_process_pdf() below.
    self.chunks = []
    self.chunk_vectors = None
    # max_features=1000 is a pragmatic cap; adjust to the PDF's vocabulary.
    self.vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    # Eagerly fetch and index the document.
    self.load_and_process_pdf()
27
+
28
def download_pdf(self) -> bytes:
    """Fetch the PDPA PDF from the configured URL.

    Returns the raw PDF bytes, or None if the request fails (best-effort:
    failures are reported to stdout rather than raised).
    """
    print("📥 กำลังดาวน์โหลด PDPA PDF...")
    try:
        resp = requests.get(self.pdf_url, timeout=30)
        resp.raise_for_status()
    except Exception as e:
        # Report and signal failure instead of crashing the app at import.
        print(f"❌ ไม่สามารถดาวน์โหลด PDF ได้: {e}")
        return None
    print("✅ ดาวน์โหลดสำเร็จ!")
    return resp.content
39
+
40
def extract_text_from_pdf(self, pdf_content: bytes) -> str:
    """Extract the text of every page of *pdf_content*.

    Pages are concatenated with a "--- หน้า N ---" separator so downstream
    chunks retain page context. Unreadable pages are skipped with a warning;
    returns "" if the PDF cannot be parsed at all.
    """
    print("📄 กำลังแยกข้อความจาก PDF...")
    try:
        reader = PyPDF2.PdfReader(BytesIO(pdf_content))
        pieces = []
        for page_num, page in enumerate(reader.pages):
            try:
                page_text = page.extract_text()
                # Tag each page so overlapping chunks can cite their source page.
                pieces.append(f"\n--- หน้า {page_num + 1} ---\n{page_text}\n")
            except Exception as e:
                print(f"⚠️ ไม่สามารถอ่านหน้า {page_num + 1}: {e}")
                continue
        print(f"✅ แยกข้อความสำเร็จ! จำนวน {len(reader.pages)} หน้า")
        # join() instead of repeated += keeps concatenation linear.
        return "".join(pieces)
    except Exception as e:
        print(f"❌ ไม่สามารถแยกข้อความได้: {e}")
        return ""
62
+
63
def chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """Split *text* into overlapping chunks of roughly *chunk_size* characters.

    Whitespace is first collapsed to single spaces. Each chunk tries to end
    on a sentence terminator ('.', '!' or '?') found within the trailing
    *overlap* characters of its window; consecutive chunks overlap by
    *overlap* characters so sentences straddling a boundary stay searchable.

    Args:
        text: raw document text.
        chunk_size: target maximum chunk length in characters.
        overlap: characters shared between consecutive chunks; must be
            smaller than chunk_size.

    Returns:
        List of non-empty, stripped chunk strings.

    Raises:
        ValueError: if overlap >= chunk_size — the window could never
            advance (the original code could loop forever for such args).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    print("✂️ กำลังแบ่งข้อความเป็นส่วนๆ...")

    # Normalise whitespace so chunk sizes are measured on clean text.
    text = re.sub(r'\s+', ' ', text.strip())

    chunks: List[str] = []
    start = 0

    while start < len(text):
        end = start + chunk_size

        if end < len(text):
            # Prefer to cut at a sentence end inside the trailing search
            # window. The window was a hard-coded 200 chars; it now tracks
            # `overlap` (identical behaviour for the default arguments).
            for i in range(end, max(end - overlap, start), -1):
                if text[i] in '.!?':
                    end = i + 1
                    break

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Step back by `overlap` so consecutive chunks share context.
        start = end - overlap
        if start >= len(text):
            break

    print(f"✅ แบ่งเป็น {len(chunks)} ส่วน")
    return chunks
94
+
95
def create_embeddings(self, chunks: List[str]):
    """Fit the TF-IDF vectorizer on *chunks* and cache the result on self.

    On failure the error is reported to stdout and self.chunk_vectors is
    left with whatever value it already held (best-effort, no raise).
    """
    print("🔢 กำลังสร้าง embeddings...")
    try:
        self.chunk_vectors = self.vectorizer.fit_transform(chunks)
    except Exception as e:
        print(f"❌ ไม่สามารถสร้าง embeddings ได้: {e}")
        return
    print("✅ สร้าง embeddings สำเร็จ!")
104
+ def load_and_process_pdf(self):
105
+ """Download and process the PDF"""
106
+ pdf_content = self.download_pdf()
107
+ if not pdf_content:
108
+ return
109
+
110
+ text = self.extract_text_from_pdf(pdf_content)
111
+ if not text:
112
+ return
113
+
114
+ self.chunks = self.chunk_text(text)
115
+ if self.chunks:
116
+ self.create_embeddings(self.ch
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ numpy
2
+ scikit-learn
3
+ requests
4
+ PyPDF2
5
+ openai