| import os |
| import torch |
| import faiss |
| import numpy as np |
| import gradio as gr |
| import re |
| from transformers import AutoTokenizer, AutoModel, pipeline |
| from sklearn.preprocessing import normalize |
|
|
| |
# IndoBERT encoder used to embed both the document chunks and user queries;
# downloaded from the Hugging Face hub on first run.
embed_tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
embed_model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")
|
|
def get_embedding(text):
    """Embed *text* into a single fixed-size vector.

    The text is tokenized for IndoBERT (truncated/padded as needed), the
    model's last hidden states are mean-pooled over the token axis, and the
    result is returned as a 1-D NumPy array.
    """
    encoded = embed_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only — no gradients needed.
    with torch.no_grad():
        hidden_states = embed_model(**encoded).last_hidden_state
    # Mean-pool across tokens, drop the batch dimension, hand back NumPy.
    return hidden_states.mean(dim=1).squeeze().numpy()
|
|
| |
| |
| llm = pipeline("text-generation", model="IzzulGod/GPT2-Indo-chat-tuned") |
|
|
| |
DATA_DIR = "data"  # folder holding one .txt file per subject (mata kuliah)
doc_chunks = {}    # subject name (upper-cased filename stem) -> list of text chunks
doc_indexes = {}   # subject name -> FAISS index over that subject's chunk embeddings
|
| |
def clean_document_text(text: str) -> str:
    """
    Cleans document text by removing common irrelevant patterns like URLs, tags,
    footers, headers, and excessive whitespace. This is crucial for accurate retrieval.
    """
    # Strip URLs (http(s)://... and www...).
    text = re.sub(r'http\S+|www\S+', '', text, flags=re.MULTILINE)
    # Drop "source" / "tags" boilerplate. Without re.MULTILINE/DOTALL, '.'
    # stops at the end of the line, so only the rest of that line is removed.
    text = re.sub(r'Sumber:.*', '', text)
    text = re.sub(r'Tags:.*', '', text)
    # Comment-count headers ("N pemikiran pada “…”"), prev/next navigation
    # arrows, and lines that are nothing but a page number.
    text = re.sub(r'^\d+\s*pemikiran pada “.*”', '', text, flags=re.MULTILINE)
    text = re.sub(r'←.*→', '', text)
    text = re.sub(r'^\d+$', '', text, flags=re.MULTILINE)
    # Collapse ALL whitespace (spaces, tabs, newlines) into single spaces.
    # NOTE: the original follow-up `re.sub(r'\n+', '\n', ...)` was dead code —
    # no newline can survive this substitution — so it has been removed.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
|
|
| |
# Build the per-subject chunk lists and FAISS indexes from every .txt file
# found in DATA_DIR. The upper-cased filename stem becomes the subject key.
for fname in os.listdir(DATA_DIR):
    if fname.endswith(".txt"):
        matkul = os.path.splitext(fname)[0].upper()
        with open(os.path.join(DATA_DIR, fname), encoding='utf-8') as f:
            raw_text = f.read()

        cleaned_text = clean_document_text(raw_text)

        # Fixed-size 300-character chunks (no overlap) are the retrieval units.
        chunks = [cleaned_text[i:i + 300] for i in range(0, len(cleaned_text), 300)]
        if not chunks:
            # Guard: a file that is empty (or cleaned down to nothing) would
            # produce a 1-D empty array, and `embeddings.shape[1]` below would
            # raise IndexError. Skip it instead of crashing at import time.
            continue
        doc_chunks[matkul] = chunks

        # Embed every chunk and L2-normalize the rows. FAISS requires
        # float32 input, so cast explicitly rather than relying on dtype luck.
        embeddings = normalize(
            np.array([get_embedding(chunk) for chunk in chunks], dtype=np.float32)
        )

        # L2 distance over unit-norm vectors is rank-equivalent to cosine similarity.
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        doc_indexes[matkul] = index
|
|
| |
def _retrieve_context(matkul: str, question: str, k: int = 5) -> str:
    """Return the top-*k* chunks of *matkul* most similar to *question*, newline-joined."""
    query_embed = normalize(get_embedding(question).reshape(1, -1))
    _, indices = doc_indexes[matkul].search(query_embed, k=k)
    # BUG FIX: FAISS pads the result with -1 when the index holds fewer than k
    # vectors; the original code then fetched doc_chunks[matkul][-1], silently
    # duplicating the *last* chunk via Python's negative indexing. Skip pads.
    return "\n".join(doc_chunks[matkul][i] for i in indices[0] if i >= 0)


def _postprocess_answer(generated_answer: str, question: str) -> str:
    """Strip boilerplate prefixes, deduplicate lines, and keep only the first sentence."""
    # Prefixes the small GPT-2 model tends to parrot. Sorted longest-first so
    # e.g. "Jawaban:" is stripped before the bare ":".
    general_unwanted_starters = [
        "Jawaban:", "Tujuan:", "Proses adalah:", "Definisi:", "Penjelasan:", "Hal ini adalah:",
        question.lower().strip(),
        "adalah",
        "terdiri dari",
        "dapat diterjemahkan oleh",
        "bahasa mesin",
        "program",
        "pengertian",
        ":"
    ]
    general_unwanted_starters.sort(key=len, reverse=True)
    for pattern in general_unwanted_starters:
        if generated_answer.lower().startswith(pattern.lower()):
            generated_answer = generated_answer[len(pattern):].strip()
            if not generated_answer:
                break

    # Drop consecutive duplicate lines and one-or-two-word filler lines.
    lines = generated_answer.split('\n')
    cleaned_lines = []
    prev_line_stripped = ""
    for line in lines:
        current_line_stripped = line.strip()
        if current_line_stripped and current_line_stripped.lower() != prev_line_stripped.lower():
            if len(current_line_stripped.split()) <= 2 and current_line_stripped.lower() in ["pengertian", "adalah", "tujuan", "proses", "terdiri", "bahasa", "mesin"]:
                continue
            cleaned_lines.append(line)
            prev_line_stripped = current_line_stripped

    generated_answer = "\n".join(cleaned_lines).strip()
    # Drop blank lines, then flatten all remaining whitespace to single spaces.
    generated_answer = os.linesep.join(s for s in generated_answer.splitlines() if s.strip())
    generated_answer = re.sub(r'\s+', ' ', generated_answer).strip()

    # Keep only the first sentence for maximum conciseness.
    if '.' in generated_answer:
        return generated_answer.split('.')[0].strip() + '.'
    return generated_answer.strip()


def rag_chat(matkul: str, question: str) -> str:
    """
    Retrieves relevant context and generates a concise, relevant answer using LLM.
    Args:
        matkul (str): The selected subject (mata kuliah).
        question (str): The user's question.
    Returns:
        str: The generated answer, cleaned and deduplicated to be very concise.
    """
    if matkul not in doc_indexes:
        return "Mata kuliah tidak ditemukan."

    context = _retrieve_context(matkul, question)

    prompt = f"""Sebagai asisten AI, berikan jawaban **paling singkat dan langsung** untuk pertanyaan berikut.
Gunakan **hanya informasi dari bagian "Informasi Relevan"** di bawah ini.
Jangan mengulang pertanyaan, menambahkan kalimat pengantar/penutup, atau informasi lain.
Fokus pada inti definisi atau penjelasan yang diminta. Jika informasi tidak cukup, jawab "Informasi tidak ditemukan."

Informasi Relevan dari mata kuliah {matkul}:
{context}

Pertanyaan: {question}
Jawaban:"""

    # Low temperature + tight top-k/top-p: favour answers grounded in the context.
    output = llm(
        prompt,
        max_new_tokens=60,
        do_sample=True,
        temperature=0.3,
        top_k=20,
        top_p=0.8,
        pad_token_id=llm.tokenizer.eos_token_id,
        num_return_sequences=1,
    )[0]["generated_text"]

    # The pipeline echoes the prompt; keep only the continuation.
    final_answer = _postprocess_answer(output[len(prompt):].strip(), question)

    # Reject empty, explicit "not found", or implausibly short (< 3 words) answers.
    if not final_answer or final_answer.lower().strip() == "informasi tidak ditemukan." or len(final_answer.split()) < 3:
        return "Informasi tidak ditemukan berdasarkan konteks yang relevan."

    return final_answer
|
|
| |
# Gradio UI: a subject dropdown (populated from the files indexed at import
# time) plus a free-text question box, wired straight into the RAG pipeline.
interface = gr.Interface(
    fn=rag_chat,
    inputs=[
        gr.Dropdown(choices=list(doc_chunks.keys()), label="Pilih Mata Kuliah"),
        gr.Textbox(label="Pertanyaan Anda")
    ],
    outputs=gr.Textbox(label="Jawaban"),
    title="Chatbot RAG & LLM Mata Kuliah",
    description="Tanyakan sesuatu berdasarkan materi tiap mata kuliah.")
|
|
# Launch the web UI only when executed as a script, not when imported.
if __name__ == "__main__":
    interface.launch()