cheryl19 commited on
Commit
7eb60d5
·
verified ·
1 Parent(s): ec75e10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -29
app.py CHANGED
@@ -3,6 +3,7 @@ import torch
3
  import faiss
4
  import numpy as np
5
  import gradio as gr
 
6
  from transformers import AutoTokenizer, AutoModel, pipeline
7
  from sklearn.preprocessing import normalize
8
 
@@ -27,15 +28,39 @@ DATA_DIR = "data"
27
  doc_chunks = {} # Stores chunks of documents: mata_kuliah -> [list of text chunks]
28
  doc_indexes = {} # Stores FAISS indexes for each mata_kuliah: mata_kuliah -> FAISS index
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  # Process each text file in the data directory
31
  for fname in os.listdir(DATA_DIR):
32
  if fname.endswith(".txt"):
33
  matkul = os.path.splitext(fname)[0].upper() # Extract subject name from filename
34
  with open(os.path.join(DATA_DIR, fname), encoding='utf-8') as f:
35
- text = f.read()
 
 
 
36
  # Split document into chunks. Adjust chunk size (e.g., 300-700) based on content.
37
- # 500 characters is a good starting point.
38
- chunks = [text[i:i+500] for i in range(0, len(text), 500)]
 
39
  doc_chunks[matkul] = chunks
40
 
41
  # Generate embeddings for all chunks and normalize them
@@ -64,17 +89,18 @@ def rag_chat(matkul: str, question: str) -> str:
64
  query_embed = get_embedding(question)
65
  query_embed = normalize(query_embed.reshape(1, -1))
66
 
67
- # Search for top-k (e.g., 5) most similar chunks in the FAISS index
 
68
  D, I = doc_indexes[matkul].search(query_embed, k=5)
69
  context = "\n".join([doc_chunks[matkul][i] for i in I[0]])
70
 
71
  # --- Prompt Optimized for Extreme Conciseness and Directness ---
72
  # The prompt explicitly asks for ONLY the direct answer and nothing else.
73
- prompt = f"""Anda adalah asisten AI yang hanya akan memberikan jawaban paling langsung dan singkat dari informasi yang disediakan.
74
- **Jawablah pertanyaan berikut dengan satu atau dua kalimat saja, langsung pada intinya.**
75
- Jangan mengulang pertanyaan, menambahkan pendahuluan atau penutup, atau informasi lain di luar yang diminta.
76
- Fokus hanya pada definisi atau penjelasan paling relevan.
77
- Jika informasi tidak cukup, nyatakan "Informasi tidak ditemukan."
78
 
79
  Informasi Relevan dari mata kuliah {matkul}:
80
  {context}
@@ -85,15 +111,16 @@ Jawaban:"""
85
  # --- Text Generation Parameters Optimized for Conciseness ---
86
  # `max_new_tokens` is significantly reduced.
87
  # `temperature` is very low for highly deterministic output.
 
88
  output = llm(prompt,
89
- max_new_tokens=50, # Greatly reduced to enforce very short answers
90
  do_sample=True,
91
- temperature=0.4, # Very low temperature for highly focused and deterministic output
92
- top_k=10, # Narrow token selection
93
- top_p=0.95, # Less diversity, more precision
94
- pad_token_id=llm.tokenizer.eos_token_id # Ensures proper handling of padding tokens
 
95
  )[0]["generated_text"]
96
- )
97
 
98
  # --- Post-processing for Aggressive Cleanup and Deduplication ---
99
  # 1. Extract the generated answer by removing the prompt
@@ -103,25 +130,26 @@ Jawaban:"""
103
  # This list is designed to be general and NOT specific to content.
104
  general_unwanted_starters = [
105
  "Jawaban:", "Tujuan:", "Proses adalah:", "Definisi:", "Penjelasan:", "Hal ini adalah:",
106
- question.lower().strip(), # Remove the question itself if it's repeated
107
  "adalah", # If "adalah" stands alone as the start of an answer, it might be noise.
108
  "terdiri dari",
109
  "dapat diterjemahkan oleh",
110
  "bahasa mesin",
111
- "program"
 
 
112
  ]
113
 
114
- # Sort by length descending to remove longer matches first
115
  general_unwanted_starters.sort(key=len, reverse=True)
116
 
117
  for pattern in general_unwanted_starters:
118
  if generated_answer.lower().startswith(pattern.lower()):
119
  generated_answer = generated_answer[len(pattern):].strip()
120
- # If the answer becomes empty, stop trying to remove more
121
  if not generated_answer:
122
- break
123
 
124
- # 3. **General Deduplication of Consecutive Lines (Enhanced for conciseness)**
125
  lines = generated_answer.split('\n')
126
  cleaned_lines = []
127
  prev_line_stripped = ""
@@ -129,25 +157,30 @@ Jawaban:"""
129
  for line in lines:
130
  current_line_stripped = line.strip()
131
  # Add line if not empty and not a case-insensitive duplicate of the previous non-empty line
132
- # Also, check if it's a very short line that's just a common word
133
  if current_line_stripped and current_line_stripped.lower() != prev_line_stripped.lower():
134
- # Add a check for very short, common, standalone words if they appear as separate lines
135
- # This is to handle things like "PengertiAN" being on its own line if the context is like that.
136
- if len(current_line_stripped.split()) <= 2 and current_line_stripped.lower() in ["pengertian", "adalah", "tujuan", "proses", "terdiri"]:
137
  continue # Skip very short, non-substantive lines
138
  cleaned_lines.append(line)
139
  prev_line_stripped = current_line_stripped
140
 
141
  generated_answer = "\n".join(cleaned_lines).strip()
142
 
143
- # 4. Remove excessive blank lines and trailing characters
144
  generated_answer = os.linesep.join([s for s in generated_answer.splitlines() if s.strip()])
 
 
 
 
 
 
 
145
 
146
- # 5. Final check for very short/empty answers
147
- if not generated_answer or generated_answer.lower().strip() == "informasi tidak ditemukan." or len(generated_answer.split()) < 3:
148
  return "Informasi tidak ditemukan berdasarkan konteks yang relevan."
149
 
150
- return generated_answer.split('.')[0].strip() + '.' if '.' in generated_answer else generated_answer.strip() # Take only the first sentence if multiple exist
151
 
152
  # === 5. Gradio Interface ===
153
  interface = gr.Interface(
 
3
  import faiss
4
  import numpy as np
5
  import gradio as gr
6
+ import re # Import regex for advanced text cleaning
7
  from transformers import AutoTokenizer, AutoModel, pipeline
8
  from sklearn.preprocessing import normalize
9
 
 
28
  doc_chunks = {} # Stores chunks of documents: mata_kuliah -> [list of text chunks]
29
  doc_indexes = {} # Stores FAISS indexes for each mata_kuliah: mata_kuliah -> FAISS index
30
 
31
# Function to clean raw text from irrelevant patterns (moved here for clarity)
def clean_document_text(text: str) -> str:
    """
    Clean raw document text before chunking and embedding.

    Removes boilerplate that hurts retrieval quality: URLs, "Sumber:" /
    "Tags:" footers, blog comment headers, navigation arrows, and lines
    that are only page numbers — then normalizes whitespace while
    preserving line breaks.

    Args:
        text: Raw document text read from a .txt file.

    Returns:
        Cleaned text with boilerplate stripped and whitespace normalized.
    """
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text, flags=re.MULTILINE)
    # Remove common irrelevant lines (source, tags, comment counts, navigation)
    text = re.sub(r'Sumber:.*', '', text)
    text = re.sub(r'Tags:.*', '', text)
    text = re.sub(r'^\d+\s*pemikiran pada “.*”', '', text, flags=re.MULTILINE)
    text = re.sub(r'←.*→', '', text)
    # Remove lines that are just numbers (like page numbers)
    text = re.sub(r'^\d+$', '', text, flags=re.MULTILINE)

    # BUG FIX: normalize newlines FIRST, then collapse only spaces/tabs.
    # The original applied r'\s+' -> ' ' before r'\n+' -> '\n'; since \s
    # matches newlines, every newline was destroyed and the newline pass
    # was dead code. Keeping line breaks preserves document structure for
    # the downstream chunking step.
    text = re.sub(r'\n+', '\n', text)    # collapse runs of blank lines
    text = re.sub(r'[ \t]+', ' ', text)  # collapse horizontal whitespace only
    return text.strip()
50
+
51
  # Process each text file in the data directory
52
  for fname in os.listdir(DATA_DIR):
53
  if fname.endswith(".txt"):
54
  matkul = os.path.splitext(fname)[0].upper() # Extract subject name from filename
55
  with open(os.path.join(DATA_DIR, fname), encoding='utf-8') as f:
56
+ raw_text = f.read()
57
+ # Apply cleaning BEFORE chunking and embedding
58
+ cleaned_text = clean_document_text(raw_text)
59
+
60
  # Split document into chunks. Adjust chunk size (e.g., 300-700) based on content.
61
+ # A smaller chunk size (e.g., 300) might be better if you want very concise answers
62
+ # and want to ensure a single relevant sentence isn't split across chunks.
63
+ chunks = [cleaned_text[i:i+300] for i in range(0, len(cleaned_text), 300)]
64
  doc_chunks[matkul] = chunks
65
 
66
  # Generate embeddings for all chunks and normalize them
 
89
  query_embed = get_embedding(question)
90
  query_embed = normalize(query_embed.reshape(1, -1))
91
 
92
+ # Search for top-k (e.g., 3 or 5) most similar chunks in the FAISS index
93
+ # K=5 is a good balance for capturing relevant context.
94
  D, I = doc_indexes[matkul].search(query_embed, k=5)
95
  context = "\n".join([doc_chunks[matkul][i] for i in I[0]])
96
 
97
  # --- Prompt Optimized for Extreme Conciseness and Directness ---
98
  # The prompt explicitly asks for ONLY the direct answer and nothing else.
99
+ # It strongly discourages extra text and encourages directness.
100
+ prompt = f"""Sebagai asisten AI, berikan jawaban **paling singkat dan langsung** untuk pertanyaan berikut.
101
+ Gunakan **hanya informasi dari bagian "Informasi Relevan"** di bawah ini.
102
+ Jangan mengulang pertanyaan, menambahkan kalimat pengantar/penutup, atau informasi lain.
103
+ Fokus pada inti definisi atau penjelasan yang diminta. Jika informasi tidak cukup, jawab "Informasi tidak ditemukan."
104
 
105
  Informasi Relevan dari mata kuliah {matkul}:
106
  {context}
 
111
  # --- Text Generation Parameters Optimized for Conciseness ---
112
  # `max_new_tokens` is significantly reduced.
113
  # `temperature` is very low for highly deterministic output.
114
+ # Using parameters recommended for IzzulGod/GPT2-Indo-chat-tuned for better balance.
115
  output = llm(prompt,
116
+ max_new_tokens=60, # Adjusted for IzzulGod model
117
  do_sample=True,
118
+ temperature=0.3, # Adjusted for IzzulGod model
119
+ top_k=20, # Adjusted for IzzulGod model
120
+ top_p=0.8, # Adjusted for IzzulGod model
121
+ pad_token_id=llm.tokenizer.eos_token_id,
122
+ num_return_sequences=1 # Ensure only one sequence is returned
123
  )[0]["generated_text"]
 
124
 
125
  # --- Post-processing for Aggressive Cleanup and Deduplication ---
126
  # 1. Extract the generated answer by removing the prompt
 
130
  # This list is designed to be general and NOT specific to content.
131
  general_unwanted_starters = [
132
  "Jawaban:", "Tujuan:", "Proses adalah:", "Definisi:", "Penjelasan:", "Hal ini adalah:",
133
+ question.lower().strip(), # Remove the question itself if it's repeated (case-insensitive)
134
  "adalah", # If "adalah" stands alone as the start of an answer, it might be noise.
135
  "terdiri dari",
136
  "dapat diterjemahkan oleh",
137
  "bahasa mesin",
138
+ "program",
139
+ "pengertian", # Specific term from your example that looks like noise
140
+ ":" # Sometimes a colon might be left
141
  ]
142
 
143
+ # Sort by length descending to remove longer matches first for effective removal
144
  general_unwanted_starters.sort(key=len, reverse=True)
145
 
146
  for pattern in general_unwanted_starters:
147
  if generated_answer.lower().startswith(pattern.lower()):
148
  generated_answer = generated_answer[len(pattern):].strip()
 
149
  if not generated_answer:
150
+ break # Stop if answer becomes empty after removal
151
 
152
+ # 3. General Deduplication of Consecutive Lines (Enhanced for conciseness)
153
  lines = generated_answer.split('\n')
154
  cleaned_lines = []
155
  prev_line_stripped = ""
 
157
  for line in lines:
158
  current_line_stripped = line.strip()
159
  # Add line if not empty and not a case-insensitive duplicate of the previous non-empty line
160
+ # Also, filter out very short, common words that might stand alone as separate lines.
161
  if current_line_stripped and current_line_stripped.lower() != prev_line_stripped.lower():
162
+ if len(current_line_stripped.split()) <= 2 and current_line_stripped.lower() in ["pengertian", "adalah", "tujuan", "proses", "terdiri", "bahasa", "mesin"]:
 
 
163
  continue # Skip very short, non-substantive lines
164
  cleaned_lines.append(line)
165
  prev_line_stripped = current_line_stripped
166
 
167
  generated_answer = "\n".join(cleaned_lines).strip()
168
 
169
+ # 4. Remove excessive blank lines and clean up whitespace (final pass)
170
  generated_answer = os.linesep.join([s for s in generated_answer.splitlines() if s.strip()])
171
+ generated_answer = re.sub(r'\s+', ' ', generated_answer).strip() # Replace multiple spaces with single
172
+
173
+ # 5. Take only the first sentence for extreme conciseness, if available
174
+ if '.' in generated_answer:
175
+ final_answer = generated_answer.split('.')[0].strip() + '.'
176
+ else:
177
+ final_answer = generated_answer.strip()
178
 
179
+ # 6. Final check: reject empty answers, the literal "not found" reply, or answers shorter than 3 words
180
+ if not final_answer or final_answer.lower().strip() == "informasi tidak ditemukan." or len(final_answer.split()) < 3:
181
  return "Informasi tidak ditemukan berdasarkan konteks yang relevan."
182
 
183
+ return final_answer
184
 
185
  # === 5. Gradio Interface ===
186
  interface = gr.Interface(