Spaces:
darsoarafa
/
Runtime error

dini15 committed on
Commit
0d715e7
·
verified ·
1 Parent(s): 173734e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -30
app.py CHANGED
@@ -1,4 +1,3 @@
1
- # Import libraries
2
  from sentence_transformers import SentenceTransformer
3
  from PyPDF2 import PdfReader
4
  import tiktoken
@@ -13,7 +12,7 @@ import pickle
13
  # == Buat folder models ==
14
  os.makedirs("models", exist_ok=True)
15
 
16
- # == Load API Key dari File (Hindari Hardcoded Key) ==
17
  def load_api_key():
18
  with open("config.json", "r") as f:
19
  config = json.load(f)
@@ -23,7 +22,6 @@ GROQ_API_KEY = load_api_key()
23
 
24
  # == Ekstraksi Teks dari PDF ==
25
  def extract_text_from_pdf(pdf_file: str) -> str:
26
- """Ekstrak teks dari PDF dan gabungkan menjadi satu string."""
27
  with open(pdf_file, 'rb') as pdf:
28
  reader = PdfReader(pdf)
29
  text = " ".join(page.extract_text() or "" for page in reader.pages)
@@ -31,8 +29,7 @@ def extract_text_from_pdf(pdf_file: str) -> str:
31
 
32
  # == Chunking Teks ==
33
  def chunk_text(text: str, max_tokens: int = 512) -> list:
34
- """Membagi teks menjadi chunk berdasarkan token menggunakan tokenizer OpenAI."""
35
- tokenizer = tiktoken.get_encoding("cl100k_base") # Gunakan tokenizer OpenAI
36
  tokens = tokenizer.encode(text)
37
 
38
  chunks = []
@@ -43,41 +40,38 @@ def chunk_text(text: str, max_tokens: int = 512) -> list:
43
 
44
  return chunks
45
 
46
- # == Embedding dengan Ollama ==
 
 
47
  def get_embedding(text: str):
48
- """Mendapatkan embedding dari teks menggunakan Ollama."""
49
- model = SentenceTransformer('all-MiniLM-L6-v2')
50
- embedding = model.encode("This is a test sentence.")
51
- return np.array(embedding["embeddings"][0], dtype=np.float32) # Pastikan mengambil list pertama
52
-
53
- # == Simpan Embedding ke FAISS ==
54
- d = 1024 # Dimensi embedding dari model `mxbai-embed-large`
55
- index = faiss.IndexFlatL2(d) # Inisialisasi FAISS Index
56
  text_chunks = []
57
 
58
  def add_to_db(text_chunks_local):
59
- """Menambahkan embedding ke FAISS."""
60
  global text_chunks
61
- text_chunks = text_chunks_local # Simpan chunk ke global var
62
- embeddings = np.array([get_embedding(text) for text in text_chunks], dtype=np.float32)
63
  index.add(embeddings)
64
 
65
  def search_db(query, k=5):
66
- """Melakukan pencarian query dalam FAISS Index."""
 
 
67
  query_embedding = np.array([get_embedding(query)], dtype=np.float32).reshape(1, -1)
68
  distances, indices = index.search(query_embedding, k)
69
- return [text_chunks[i] for i in indices[0]] # Ambil teks chunk yang relevan
70
 
71
  def save_to_faiss(index_path="vector_index.faiss"):
72
- """Menyimpan FAISS index ke file."""
73
  faiss.write_index(index, index_path)
74
 
75
  def load_faiss(index_path="vector_index.faiss"):
76
- """Memuat kembali FAISS index dari file."""
77
  global index
78
  index = faiss.read_index(index_path)
79
 
80
- # == Simpan dan Load Model Embedding ==
81
  def save_embeddings(embeddings_path="models/embeddings.pkl"):
82
  with open(embeddings_path, "wb") as f:
83
  pickle.dump(index, f)
@@ -91,7 +85,6 @@ def load_embeddings(embeddings_path="models/embeddings.pkl"):
91
  client = groq.Client(api_key=GROQ_API_KEY)
92
 
93
  def query_llama(prompt):
94
- """Menggunakan LLaMA untuk menjawab pertanyaan dengan konteks yang diberikan."""
95
  response = client.chat.completions.create(
96
  model="llama3-8b-8192",
97
  messages=[{"role": "user", "content": prompt}],
@@ -102,14 +95,12 @@ def query_llama(prompt):
102
  # == Main Workflow ==
103
  if __name__ == '__main__':
104
  pdf_text = extract_text_from_pdf('dini_anggriyani_synthetic_data.pdf')
105
- text_chunks = chunk_text(pdf_text, max_tokens=1024) # Sesuaikan dengan LLaMA
106
 
107
- # Tambahkan ke database FAISS
108
  add_to_db(text_chunks)
109
- save_to_faiss() # Simpan FAISS index
110
- save_embeddings()
111
 
112
- # Tes pencarian RAG
113
  retrieved_chunks = search_db("Apa isi dokumen ini?")
114
  context = "\n".join(retrieved_chunks)
115
 
@@ -117,9 +108,9 @@ if __name__ == '__main__':
117
  answer = query_llama(prompt)
118
  print(answer)
119
 
120
- # == Buat Chatbot Interface ==
121
  def chatbot_interface(user_query):
122
- retrieved_chunks = search_db(user_query) # Sudah berupa teks
123
  context = "\n".join(retrieved_chunks)
124
 
125
  prompt = f"Gunakan informasi berikut untuk menjawab:\n{context}\n\nPertanyaan: {user_query}"
 
 
1
  from sentence_transformers import SentenceTransformer
2
  from PyPDF2 import PdfReader
3
  import tiktoken
 
12
  # == Buat folder models ==
13
  os.makedirs("models", exist_ok=True)
14
 
15
+ # == Load API Key dari File ==
16
  def load_api_key():
17
  with open("config.json", "r") as f:
18
  config = json.load(f)
 
22
 
23
  # == Ekstraksi Teks dari PDF ==
24
  def extract_text_from_pdf(pdf_file: str) -> str:
 
25
  with open(pdf_file, 'rb') as pdf:
26
  reader = PdfReader(pdf)
27
  text = " ".join(page.extract_text() or "" for page in reader.pages)
 
29
 
30
  # == Chunking Teks ==
31
  def chunk_text(text: str, max_tokens: int = 512) -> list:
32
+ tokenizer = tiktoken.get_encoding("cl100k_base")
 
33
  tokens = tokenizer.encode(text)
34
 
35
  chunks = []
 
40
 
41
  return chunks
42
 
43
+ # == Embedding dengan SentenceTransformer ==
44
+ model = SentenceTransformer('all-MiniLM-L6-v2') # Global model
45
+
46
  def get_embedding(text: str):
47
+ return np.array(model.encode(text), dtype=np.float32)
48
+
49
+ # == Setup FAISS ==
50
+ d = 384 # Dimensi embedding sesuai dengan model
51
+ index = faiss.IndexFlatL2(d)
 
 
 
52
  text_chunks = []
53
 
54
  def add_to_db(text_chunks_local):
 
55
  global text_chunks
56
+ text_chunks = text_chunks_local
57
+ embeddings = np.array([get_embedding(text) for text in text_chunks], dtype=np.float32).reshape(-1, d)
58
  index.add(embeddings)
59
 
60
  def search_db(query, k=5):
61
+ if index.ntotal == 0:
62
+ return ["Database masih kosong, silakan tambahkan data."]
63
+
64
  query_embedding = np.array([get_embedding(query)], dtype=np.float32).reshape(1, -1)
65
  distances, indices = index.search(query_embedding, k)
66
+ return [text_chunks[i] for i in indices[0] if i < len(text_chunks)]
67
 
68
  def save_to_faiss(index_path="vector_index.faiss"):
 
69
  faiss.write_index(index, index_path)
70
 
71
  def load_faiss(index_path="vector_index.faiss"):
 
72
  global index
73
  index = faiss.read_index(index_path)
74
 
 
75
  def save_embeddings(embeddings_path="models/embeddings.pkl"):
76
  with open(embeddings_path, "wb") as f:
77
  pickle.dump(index, f)
 
85
  client = groq.Client(api_key=GROQ_API_KEY)
86
 
87
  def query_llama(prompt):
 
88
  response = client.chat.completions.create(
89
  model="llama3-8b-8192",
90
  messages=[{"role": "user", "content": prompt}],
 
95
  # == Main Workflow ==
96
  if __name__ == '__main__':
97
  pdf_text = extract_text_from_pdf('dini_anggriyani_synthetic_data.pdf')
98
+ text_chunks = chunk_text(pdf_text, max_tokens=1024)
99
 
 
100
  add_to_db(text_chunks)
101
+ save_to_faiss()
102
+ save_embeddings()
103
 
 
104
  retrieved_chunks = search_db("Apa isi dokumen ini?")
105
  context = "\n".join(retrieved_chunks)
106
 
 
108
  answer = query_llama(prompt)
109
  print(answer)
110
 
111
+ # == Chatbot Interface ==
112
  def chatbot_interface(user_query):
113
+ retrieved_chunks = search_db(user_query)
114
  context = "\n".join(retrieved_chunks)
115
 
116
  prompt = f"Gunakan informasi berikut untuk menjawab:\n{context}\n\nPertanyaan: {user_query}"