Rifqidits committed on
Commit
cb9ff77
·
1 Parent(s): e2b7d51

Change to Bahasa Indonesia

Browse files
Files changed (2) hide show
  1. app.py +3 -20
  2. utils.py +5 -24
app.py CHANGED
@@ -14,11 +14,9 @@ from openpyxl import load_workbook
14
  from typing import List, Dict, Any, Tuple
15
  from utils import *
16
 
17
- # # === [1] Model and Tokenizer Loading ===
18
  # base_model_id = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO"
19
  # lora_path = "tat-llm-final-e4"
20
 
21
- # # Load base model and LoRA adapter
22
  # base_model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.float16)
23
  # model = PeftModel.from_pretrained(base_model, lora_path)
24
 
@@ -26,23 +24,19 @@ from utils import *
26
  # model = model.to(device)
27
  # model.eval()
28
 
29
- # # Load tokenizer
30
  # tokenizer = AutoTokenizer.from_pretrained(lora_path)
31
 
32
- # === Updated Generate Answer Function ===
33
  @spaces.GPU(duration=60)
34
  def generate_answer(json_data: Dict[str, Any], question: str) -> str:
35
  """
36
  Generate answer using the fine-tuned model.
37
  """
38
- # === [1] Model and Tokenizer Loading ===
39
  base_model_id = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO"
40
  lora_path = "tat-llm-final-e4"
41
 
42
  # Load base model and LoRA adapter
43
  base_model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.float16)
44
  model = PeftModel.from_pretrained(base_model, lora_path)
45
- # Load tokenizer
46
  tokenizer = AutoTokenizer.from_pretrained(lora_path)
47
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
48
  model = model.to(device)
@@ -55,7 +49,6 @@ def generate_answer(json_data: Dict[str, Any], question: str) -> str:
55
  device = next(model.parameters()).device
56
  inputs = {k: v.to(device) for k, v in inputs.items()}
57
 
58
- # Get input length to extract only generated text
59
  input_length = inputs["input_ids"].shape[1]
60
 
61
  with torch.no_grad():
@@ -67,13 +60,12 @@ def generate_answer(json_data: Dict[str, Any], question: str) -> str:
67
  pad_token_id=tokenizer.eos_token_id
68
  )
69
 
70
- # Decode only the generated part
71
  generated_tokens = outputs[0][input_length:]
72
  answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)
73
 
74
  return answer
75
 
76
- # === Gradio Interface Functions ===
77
  def process_xlsx(file):
78
  """
79
  Process uploaded XLSX file and return JSON, JSONL, and Markdown.
@@ -82,10 +74,8 @@ def process_xlsx(file):
82
  return None, "", "", ""
83
 
84
  try:
85
- # Convert XLSX to JSON
86
  json_data = xlsx_to_json(file.name)
87
-
88
- # Generate different formats
89
  json_str = json.dumps(json_data, indent=2, ensure_ascii=False)
90
  jsonl_str = json_to_jsonl(json_data)
91
  markdown_str = json_to_markdown(json_data)
@@ -110,7 +100,7 @@ def chat_interface(json_data, question, history):
110
  except Exception as e:
111
  return history + [[question, f"Error generating answer: {str(e)}"]]
112
 
113
- # === Gradio UI ===
114
  with gr.Blocks(title="TAT-LLM: Semi-Tabular Data QA", theme=gr.themes.Soft()) as demo:
115
  gr.HTML("""
116
  <style>
@@ -132,12 +122,10 @@ with gr.Blocks(title="TAT-LLM: Semi-Tabular Data QA", theme=gr.themes.Soft()) as
132
  Sistem akan mengonversi berkas Anda ke format JSON dan menggunakan model TAT-LLM yang telah disempurnakan untuk menjawab pertanyaan.
133
  """)
134
 
135
- # State to store JSON data
136
  json_data_state = gr.State()
137
 
138
  with gr.Row():
139
  with gr.Column(scale=1):
140
- # File upload section
141
  file_input = gr.File(
142
  label="Upload XLSX File",
143
  file_types=[".xlsx"],
@@ -146,7 +134,6 @@ with gr.Blocks(title="TAT-LLM: Semi-Tabular Data QA", theme=gr.themes.Soft()) as
146
 
147
  process_btn = gr.Button("Process File", variant="primary")
148
 
149
- # Format display tabs
150
  with gr.Tabs():
151
  with gr.Tab("Markdown Preview"):
152
  markdown_output = gr.Markdown(label="Markdown Preview")
@@ -166,7 +153,6 @@ with gr.Blocks(title="TAT-LLM: Semi-Tabular Data QA", theme=gr.themes.Soft()) as
166
  )
167
 
168
  with gr.Column(scale=1):
169
- # Chat interface
170
  gr.Markdown("### Ajukan Pertanyaan Mengenai Data Anda")
171
  chatbot = gr.Chatbot(height=400)
172
  msg = gr.Textbox(
@@ -179,7 +165,6 @@ with gr.Blocks(title="TAT-LLM: Semi-Tabular Data QA", theme=gr.themes.Soft()) as
179
  submit_btn = gr.Button("Submit", variant="primary")
180
  clear_btn = gr.Button("Clear Chat")
181
 
182
- # Example questions
183
  gr.Examples(
184
  examples=[
185
  "Apa saja wawasan yang bisa kita ambil dari data ini?",
@@ -191,7 +176,6 @@ with gr.Blocks(title="TAT-LLM: Semi-Tabular Data QA", theme=gr.themes.Soft()) as
191
  inputs=msg
192
  )
193
 
194
- # Event handlers
195
  process_btn.click(
196
  fn=process_xlsx,
197
  inputs=[file_input],
@@ -221,6 +205,5 @@ with gr.Blocks(title="TAT-LLM: Semi-Tabular Data QA", theme=gr.themes.Soft()) as
221
  outputs=[chatbot]
222
  )
223
 
224
- # Launch the app
225
  if __name__ == "__main__":
226
  demo.queue().launch(share=True)
 
14
  from typing import List, Dict, Any, Tuple
15
  from utils import *
16
 
 
17
  # base_model_id = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO"
18
  # lora_path = "tat-llm-final-e4"
19
 
 
20
  # base_model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.float16)
21
  # model = PeftModel.from_pretrained(base_model, lora_path)
22
 
 
24
  # model = model.to(device)
25
  # model.eval()
26
 
 
27
  # tokenizer = AutoTokenizer.from_pretrained(lora_path)
28
 
 
29
  @spaces.GPU(duration=60)
30
  def generate_answer(json_data: Dict[str, Any], question: str) -> str:
31
  """
32
  Generate answer using the fine-tuned model.
33
  """
 
34
  base_model_id = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO"
35
  lora_path = "tat-llm-final-e4"
36
 
37
  # Load base model and LoRA adapter
38
  base_model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.float16)
39
  model = PeftModel.from_pretrained(base_model, lora_path)
 
40
  tokenizer = AutoTokenizer.from_pretrained(lora_path)
41
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
42
  model = model.to(device)
 
49
  device = next(model.parameters()).device
50
  inputs = {k: v.to(device) for k, v in inputs.items()}
51
 
 
52
  input_length = inputs["input_ids"].shape[1]
53
 
54
  with torch.no_grad():
 
60
  pad_token_id=tokenizer.eos_token_id
61
  )
62
 
 
63
  generated_tokens = outputs[0][input_length:]
64
  answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)
65
 
66
  return answer
67
 
68
+ # Gradio interface functions
69
  def process_xlsx(file):
70
  """
71
  Process uploaded XLSX file and return JSON, JSONL, and Markdown.
 
74
  return None, "", "", ""
75
 
76
  try:
 
77
  json_data = xlsx_to_json(file.name)
78
+
 
79
  json_str = json.dumps(json_data, indent=2, ensure_ascii=False)
80
  jsonl_str = json_to_jsonl(json_data)
81
  markdown_str = json_to_markdown(json_data)
 
100
  except Exception as e:
101
  return history + [[question, f"Error generating answer: {str(e)}"]]
102
 
103
+ # Gradio UI
104
  with gr.Blocks(title="TAT-LLM: Semi-Tabular Data QA", theme=gr.themes.Soft()) as demo:
105
  gr.HTML("""
106
  <style>
 
122
  Sistem akan mengonversi berkas Anda ke format JSON dan menggunakan model TAT-LLM yang telah disempurnakan untuk menjawab pertanyaan.
123
  """)
124
 
 
125
  json_data_state = gr.State()
126
 
127
  with gr.Row():
128
  with gr.Column(scale=1):
 
129
  file_input = gr.File(
130
  label="Upload XLSX File",
131
  file_types=[".xlsx"],
 
134
 
135
  process_btn = gr.Button("Process File", variant="primary")
136
 
 
137
  with gr.Tabs():
138
  with gr.Tab("Markdown Preview"):
139
  markdown_output = gr.Markdown(label="Markdown Preview")
 
153
  )
154
 
155
  with gr.Column(scale=1):
 
156
  gr.Markdown("### Ajukan Pertanyaan Mengenai Data Anda")
157
  chatbot = gr.Chatbot(height=400)
158
  msg = gr.Textbox(
 
165
  submit_btn = gr.Button("Submit", variant="primary")
166
  clear_btn = gr.Button("Clear Chat")
167
 
 
168
  gr.Examples(
169
  examples=[
170
  "Apa saja wawasan yang bisa kita ambil dari data ini?",
 
176
  inputs=msg
177
  )
178
 
 
179
  process_btn.click(
180
  fn=process_xlsx,
181
  inputs=[file_input],
 
205
  outputs=[chatbot]
206
  )
207
 
 
208
  if __name__ == "__main__":
209
  demo.queue().launch(share=True)
utils.py CHANGED
@@ -3,7 +3,7 @@ import json
3
  from typing import List, Dict, Any, Tuple
4
  from openpyxl import load_workbook
5
 
6
- # === XLSX to JSON Conversion Functions ===
7
  def detect_table_and_paragraphs(worksheet) -> Tuple[List[List[str]], List[Dict[str, Any]]]:
8
  data = []
9
  max_col = worksheet.max_column
@@ -13,7 +13,6 @@ def detect_table_and_paragraphs(worksheet) -> Tuple[List[List[str]], List[Dict[s
13
  if any(cell is not None for cell in row):
14
  data.append([str(cell).strip() if cell is not None else "" for cell in row])
15
 
16
- # Try detecting start of a table
17
  table_data = []
18
  paragraph_texts = []
19
  in_table = False
@@ -25,7 +24,6 @@ def detect_table_and_paragraphs(worksheet) -> Tuple[List[List[str]], List[Dict[s
25
  in_table = True
26
  table_data.append(row)
27
  elif in_table and len(non_empty) >= 2:
28
- # Continue table (in case of header rows or descriptive rows)
29
  table_data.append(row)
30
  else:
31
  paragraph = " ".join(non_empty)
@@ -44,65 +42,48 @@ def detect_table_and_paragraphs(worksheet) -> Tuple[List[List[str]], List[Dict[s
44
  return table_data, paragraphs
45
 
46
  def xlsx_to_json(file_path) -> Dict[str, Any]:
47
- """
48
- Convert XLSX file to TAT-QA JSON format.
49
- """
50
  workbook = load_workbook(file_path, data_only=True)
51
  worksheet = workbook.active
52
 
53
- # Extract table and paragraphs
54
  table_data, paragraphs = detect_table_and_paragraphs(worksheet)
55
 
56
- # Create JSON structure
57
  json_data = {
58
  "table": {
59
  "uid": str(uuid.uuid4()),
60
  "table": table_data
61
  },
62
  "paragraphs": paragraphs,
63
- "questions": [] # Empty for user to fill later
64
  }
65
 
66
  return json_data
67
 
68
  def json_to_jsonl(json_data: Dict[str, Any]) -> str:
69
- """
70
- Convert JSON to JSONL format (one JSON object per line).
71
- """
72
  return json.dumps(json_data, ensure_ascii=False)
73
 
74
  def json_to_markdown(json_data: Dict[str, Any]) -> str:
75
- """
76
- Convert JSON data to markdown format for display.
77
- """
78
- markdown_content = "## Table Data\n\n"
79
 
80
  # Convert table to markdown
81
  table = json_data["table"]["table"]
82
  if table:
83
- # Create markdown table
84
  markdown_content += "| " + " | ".join(table[0]) + " |\n"
85
  markdown_content += "| " + " | ".join(["---"] * len(table[0])) + " |\n"
86
  for row in table[1:]:
87
  markdown_content += "| " + " | ".join(row) + " |\n"
88
 
89
  # Add paragraphs
90
- markdown_content += "\n## Context/Paragraphs\n\n"
91
  for para in json_data["paragraphs"]:
92
  markdown_content += f"{para['order']}. {para['text']}\n\n"
93
 
94
  return markdown_content
95
 
96
- # === Updated Prompt Creation Function ===
97
  def create_prompt(table_data: Dict[str, Any], question: str) -> str:
98
- """
99
- Create prompt in the same format as training data.
100
- """
101
- # Convert table to markdown format
102
  table = table_data["table"]["table"]
103
  table_md = "\n".join(["| " + " | ".join(row) + " |" for row in table])
104
 
105
- # Extract paragraph texts
106
  text_content = "\n".join([p["text"] for p in table_data["paragraphs"]])
107
 
108
  prompt = f"""### Instruction
 
3
  from typing import List, Dict, Any, Tuple
4
  from openpyxl import load_workbook
5
 
6
+ # XLSX to JSON conversion functions
7
  def detect_table_and_paragraphs(worksheet) -> Tuple[List[List[str]], List[Dict[str, Any]]]:
8
  data = []
9
  max_col = worksheet.max_column
 
13
  if any(cell is not None for cell in row):
14
  data.append([str(cell).strip() if cell is not None else "" for cell in row])
15
 
 
16
  table_data = []
17
  paragraph_texts = []
18
  in_table = False
 
24
  in_table = True
25
  table_data.append(row)
26
  elif in_table and len(non_empty) >= 2:
 
27
  table_data.append(row)
28
  else:
29
  paragraph = " ".join(non_empty)
 
42
  return table_data, paragraphs
43
 
44
  def xlsx_to_json(file_path) -> Dict[str, Any]:
 
 
 
45
  workbook = load_workbook(file_path, data_only=True)
46
  worksheet = workbook.active
47
 
 
48
  table_data, paragraphs = detect_table_and_paragraphs(worksheet)
49
 
 
50
  json_data = {
51
  "table": {
52
  "uid": str(uuid.uuid4()),
53
  "table": table_data
54
  },
55
  "paragraphs": paragraphs,
56
+ "questions": []
57
  }
58
 
59
  return json_data
60
 
61
  def json_to_jsonl(json_data: Dict[str, Any]) -> str:
 
 
 
62
  return json.dumps(json_data, ensure_ascii=False)
63
 
64
  def json_to_markdown(json_data: Dict[str, Any]) -> str:
65
+ markdown_content = "## Data Tabel\n\n"
 
 
 
66
 
67
  # Convert table to markdown
68
  table = json_data["table"]["table"]
69
  if table:
 
70
  markdown_content += "| " + " | ".join(table[0]) + " |\n"
71
  markdown_content += "| " + " | ".join(["---"] * len(table[0])) + " |\n"
72
  for row in table[1:]:
73
  markdown_content += "| " + " | ".join(row) + " |\n"
74
 
75
  # Add paragraphs
76
+ markdown_content += "\n## Konteks/Paragraf\n\n"
77
  for para in json_data["paragraphs"]:
78
  markdown_content += f"{para['order']}. {para['text']}\n\n"
79
 
80
  return markdown_content
81
 
82
+ # Prompt creation function
83
  def create_prompt(table_data: Dict[str, Any], question: str) -> str:
 
 
 
 
84
  table = table_data["table"]["table"]
85
  table_md = "\n".join(["| " + " | ".join(row) + " |" for row in table])
86
 
 
87
  text_content = "\n".join([p["text"] for p in table_data["paragraphs"]])
88
 
89
  prompt = f"""### Instruction