Spaces:

cheryl19
/

ITBOT

Sleeping

App Files Files Community

cheryl19 committed on Jun 23, 2025

Commit

7eb60d5

verified ·

1 Parent(s): ec75e10

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -29

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import torch
 import faiss
 import numpy as np
 import gradio as gr
 from transformers import AutoTokenizer, AutoModel, pipeline
 from sklearn.preprocessing import normalize
@@ -27,15 +28,39 @@ DATA_DIR = "data"
 doc_chunks = {}     # Stores chunks of documents: mata_kuliah -> [list of text chunks]
 doc_indexes = {}    # Stores FAISS indexes for each mata_kuliah: mata_kuliah -> FAISS index
 # Process each text file in the data directory
 for fname in os.listdir(DATA_DIR):
     if fname.endswith(".txt"):
         matkul = os.path.splitext(fname)[0].upper() # Extract subject name from filename
         with open(os.path.join(DATA_DIR, fname), encoding='utf-8') as f:
-            text = f.read()
             # Split document into chunks. Adjust chunk size (e.g., 300-700) based on content.
-            # 500 characters is a good starting point.
-            chunks = [text[i:i+500] for i in range(0, len(text), 500)]
             doc_chunks[matkul] = chunks
             # Generate embeddings for all chunks and normalize them
@@ -64,17 +89,18 @@ def rag_chat(matkul: str, question: str) -> str:
     query_embed = get_embedding(question)
     query_embed = normalize(query_embed.reshape(1, -1))
-    # Search for top-k (e.g., 5) most similar chunks in the FAISS index
     D, I = doc_indexes[matkul].search(query_embed, k=5)
     context = "\n".join([doc_chunks[matkul][i] for i in I[0]])
     # --- Prompt Optimized for Extreme Conciseness and Directness ---
     # The prompt explicitly asks for ONLY the direct answer and nothing else.
-    prompt = f"""Anda adalah asisten AI yang hanya akan memberikan jawaban paling langsung dan singkat dari informasi yang disediakan.
-**Jawablah pertanyaan berikut dengan satu atau dua kalimat saja, langsung pada intinya.**
-Jangan mengulang pertanyaan, menambahkan pendahuluan atau penutup, atau informasi lain di luar yang diminta.
-Fokus hanya pada definisi atau penjelasan paling relevan.
-Jika informasi tidak cukup, nyatakan "Informasi tidak ditemukan."
 Informasi Relevan dari mata kuliah {matkul}:
 {context}
@@ -85,15 +111,16 @@ Jawaban:"""
     # --- Text Generation Parameters Optimized for Conciseness ---
     # `max_new_tokens` is significantly reduced.
     # `temperature` is very low for highly deterministic output.
     output = llm(prompt,
-                 max_new_tokens=50, # Greatly reduced to enforce very short answers
                  do_sample=True,
-                 temperature=0.4,    # Very low temperature for highly focused and deterministic output
-                 top_k=10,           # Narrow token selection
-                 top_p=0.95,          # Less diversity, more precision
-                 pad_token_id=llm.tokenizer.eos_token_id # Ensures proper handling of padding tokens
                 )[0]["generated_text"]
-)
     # --- Post-processing for Aggressive Cleanup and Deduplication ---
     # 1. Extract the generated answer by removing the prompt
@@ -103,25 +130,26 @@ Jawaban:"""
     # This list is designed to be general and NOT specific to content.
     general_unwanted_starters = [
         "Jawaban:", "Tujuan:", "Proses adalah:", "Definisi:", "Penjelasan:", "Hal ini adalah:",
-        question.lower().strip(), # Remove the question itself if it's repeated
         "adalah", # If "adalah" stands alone as the start of an answer, it might be noise.
         "terdiri dari",
         "dapat diterjemahkan oleh",
         "bahasa mesin",
-        "program"
     ]
-    # Sort by length descending to remove longer matches first
     general_unwanted_starters.sort(key=len, reverse=True)
     for pattern in general_unwanted_starters:
         if generated_answer.lower().startswith(pattern.lower()):
             generated_answer = generated_answer[len(pattern):].strip()
-            # If the answer becomes empty, stop trying to remove more
             if not generated_answer:
-                break
-    # 3. **General Deduplication of Consecutive Lines (Enhanced for conciseness)**
     lines = generated_answer.split('\n')
     cleaned_lines = []
     prev_line_stripped = ""
@@ -129,25 +157,30 @@ Jawaban:"""
     for line in lines:
         current_line_stripped = line.strip()
         # Add line if not empty and not a case-insensitive duplicate of the previous non-empty line
-        # Also, check if it's a very short line that's just a common word
         if current_line_stripped and current_line_stripped.lower() != prev_line_stripped.lower():
-            # Add a check for very short, common, standalone words if they appear as separate lines
-            # This is to handle things like "PengertiAN" being on its own line if the context is like that.
-            if len(current_line_stripped.split()) <= 2 and current_line_stripped.lower() in ["pengertian", "adalah", "tujuan", "proses", "terdiri"]:
                 continue # Skip very short, non-substantive lines
             cleaned_lines.append(line)
         prev_line_stripped = current_line_stripped
     generated_answer = "\n".join(cleaned_lines).strip()
-    # 4. Remove excessive blank lines and trailing characters
     generated_answer = os.linesep.join([s for s in generated_answer.splitlines() if s.strip()])
-    # 5. Final check for very short/empty answers
-    if not generated_answer or generated_answer.lower().strip() == "informasi tidak ditemukan." or len(generated_answer.split()) < 3:
         return "Informasi tidak ditemukan berdasarkan konteks yang relevan."
-    return generated_answer.split('.')[0].strip() + '.' if '.' in generated_answer else generated_answer.strip() # Take only the first sentence if multiple exist
 # === 5. Gradio Interface ===
 interface = gr.Interface(

 import faiss
 import numpy as np
 import gradio as gr
+import re  # Import regex for advanced text cleaning
 from transformers import AutoTokenizer, AutoModel, pipeline
 from sklearn.preprocessing import normalize
 doc_chunks = {}     # Stores chunks of documents: mata_kuliah -> [list of text chunks]
 doc_indexes = {}    # Stores FAISS indexes for each mata_kuliah: mata_kuliah -> FAISS index
+# Function to clean raw text from irrelevant patterns (moved here for clarity)
def clean_document_text(text: str) -> str:
    """Strip boilerplate from raw document text before chunking and embedding.

    The following patterns are removed, in order:

    * URLs (anything starting with ``http`` or ``www`` up to the next
      whitespace),
    * ``Sumber:`` and ``Tags:`` markers together with the rest of their line,
    * comment-count headers of the form ``<N> pemikiran pada “...”``,
    * navigation spans delimited by ``←`` and ``→`` on a single line,
    * lines consisting only of digits (e.g. page numbers).

    Whitespace is then normalized: runs of horizontal whitespace become a
    single space, and runs of line breaks (including blank or space-only
    lines) become a single newline. Line breaks are preserved rather than
    flattened into spaces.

    Args:
        text: Raw document contents.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text, flags=re.MULTILINE)
    # Remove common irrelevant lines (e.g., source, tags, page numbers, navigation)
    text = re.sub(r'Sumber:.*', '', text)
    text = re.sub(r'Tags:.*', '', text)
    text = re.sub(r'^\d+\s*pemikiran pada “.*”', '', text, flags=re.MULTILINE)
    text = re.sub(r'←.*→', '', text)
    # Remove lines that are just numbers (like page numbers)
    text = re.sub(r'^\d+$', '', text, flags=re.MULTILINE)
    # Collapse horizontal whitespace only. Using `\s+` here would also swallow
    # newlines and make the newline normalization below a no-op.
    text = re.sub(r'[^\S\n]+', ' ', text)
    # Collapse consecutive line breaks (and the blank / space-only lines left
    # behind by the removals above) into a single newline, trimming the spaces
    # that sit directly around each break.
    text = re.sub(r' ?\n[ \n]*', '\n', text)
    return text.strip()
 # Process each text file in the data directory
 for fname in os.listdir(DATA_DIR):
     if fname.endswith(".txt"):
         matkul = os.path.splitext(fname)[0].upper() # Extract subject name from filename
         with open(os.path.join(DATA_DIR, fname), encoding='utf-8') as f:
+            raw_text = f.read()
+            # Apply cleaning BEFORE chunking and embedding
+            cleaned_text = clean_document_text(raw_text)
             # Split document into chunks. Adjust chunk size (e.g., 300-700) based on content.
+            # A smaller chunk size (e.g., 300) might be better if you want very concise answers
+            # and want to ensure a single relevant sentence isn't split across chunks.
+            chunks = [cleaned_text[i:i+300] for i in range(0, len(cleaned_text), 300)]
             doc_chunks[matkul] = chunks
             # Generate embeddings for all chunks and normalize them
     query_embed = get_embedding(question)
     query_embed = normalize(query_embed.reshape(1, -1))
+    # Search for top-k (e.g., 3 or 5) most similar chunks in the FAISS index
+    # K=5 is a good balance for capturing relevant context.
     D, I = doc_indexes[matkul].search(query_embed, k=5)
     context = "\n".join([doc_chunks[matkul][i] for i in I[0]])
     # --- Prompt Optimized for Extreme Conciseness and Directness ---
     # The prompt explicitly asks for ONLY the direct answer and nothing else.
+    # It strongly discourages extra text and encourages directness.
+    prompt = f"""Sebagai asisten AI, berikan jawaban **paling singkat dan langsung** untuk pertanyaan berikut.
+Gunakan **hanya informasi dari bagian "Informasi Relevan"** di bawah ini.
+Jangan mengulang pertanyaan, menambahkan kalimat pengantar/penutup, atau informasi lain.
+Fokus pada inti definisi atau penjelasan yang diminta. Jika informasi tidak cukup, jawab "Informasi tidak ditemukan."
 Informasi Relevan dari mata kuliah {matkul}:
 {context}
     # --- Text Generation Parameters Optimized for Conciseness ---
     # `max_new_tokens` is significantly reduced.
     # `temperature` is very low for highly deterministic output.
+    # Using parameters recommended for IzzulGod/GPT2-Indo-chat-tuned for better balance.
     output = llm(prompt,
+                 max_new_tokens=60,  # Adjusted for IzzulGod model
                  do_sample=True,
+                 temperature=0.3,    # Adjusted for IzzulGod model
+                 top_k=20,           # Adjusted for IzzulGod model
+                 top_p=0.8,          # Adjusted for IzzulGod model
+                 pad_token_id=llm.tokenizer.eos_token_id,
+                 num_return_sequences=1 # Ensure only one sequence is returned
                 )[0]["generated_text"]
     # --- Post-processing for Aggressive Cleanup and Deduplication ---
     # 1. Extract the generated answer by removing the prompt
     # This list is designed to be general and NOT specific to content.
     general_unwanted_starters = [
         "Jawaban:", "Tujuan:", "Proses adalah:", "Definisi:", "Penjelasan:", "Hal ini adalah:",
+        question.lower().strip(), # Remove the question itself if it's repeated (case-insensitive)
         "adalah", # If "adalah" stands alone as the start of an answer, it might be noise.
         "terdiri dari",
         "dapat diterjemahkan oleh",
         "bahasa mesin",
+        "program",
+        "pengertian", # Specific term from your example that looks like noise
+        ":" # Sometimes a colon might be left
     ]
+    # Sort by length descending to remove longer matches first for effective removal
     general_unwanted_starters.sort(key=len, reverse=True)
     for pattern in general_unwanted_starters:
         if generated_answer.lower().startswith(pattern.lower()):
             generated_answer = generated_answer[len(pattern):].strip()
             if not generated_answer:
+                break # Stop if answer becomes empty after removal
+    # 3. General Deduplication of Consecutive Lines (Enhanced for conciseness)
     lines = generated_answer.split('\n')
     cleaned_lines = []
     prev_line_stripped = ""
     for line in lines:
         current_line_stripped = line.strip()
         # Add line if not empty and not a case-insensitive duplicate of the previous non-empty line
+        # Also, filter out very short, common words that might stand alone as separate lines.
         if current_line_stripped and current_line_stripped.lower() != prev_line_stripped.lower():
+            if len(current_line_stripped.split()) <= 2 and current_line_stripped.lower() in ["pengertian", "adalah", "tujuan", "proses", "terdiri", "bahasa", "mesin"]:
                 continue # Skip very short, non-substantive lines
             cleaned_lines.append(line)
         prev_line_stripped = current_line_stripped
     generated_answer = "\n".join(cleaned_lines).strip()
+    # 4. Remove excessive blank lines and clean up whitespace (final pass)
     generated_answer = os.linesep.join([s for s in generated_answer.splitlines() if s.strip()])
+    generated_answer = re.sub(r'\s+', ' ', generated_answer).strip() # Replace multiple spaces with single
+    # 5. Take only the first sentence for extreme conciseness, if available
+    if '.' in generated_answer:
+        final_answer = generated_answer.split('.')[0].strip() + '.'
+    else:
+        final_answer = generated_answer.strip()
+    # 6. Final check for very short/empty answers or answers that are just the question
+    if not final_answer or final_answer.lower().strip() == "informasi tidak ditemukan." or len(final_answer.split()) < 3:
         return "Informasi tidak ditemukan berdasarkan konteks yang relevan."
+    return final_answer
 # === 5. Gradio Interface ===
 interface = gr.Interface(