MashiroLn committed on
Commit
96ec759
·
verified ·
1 Parent(s): ad6c225

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. app.py +5 -1
  2. apps/json_editor.py +283 -0
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from apps import pdf_cropper, text_tools, paper_image_tool
3
 
4
  def create_main_interface():
5
  with gr.Blocks(title="我的科研工具箱") as main_app:
@@ -19,6 +19,10 @@ def create_main_interface():
19
  # --- 工具 3: 科研配图助手 ---
20
  with gr.TabItem("📑 Image Auto Cropper"):
21
  paper_image_tool.create_paper_tool()
 
 
 
 
22
 
23
  # --- 可以在这里继续添加更多 Tab ---
24
 
 
1
  import gradio as gr
2
+ from apps import pdf_cropper, text_tools, paper_image_tool, json_editor
3
 
4
  def create_main_interface():
5
  with gr.Blocks(title="我的科研工具箱") as main_app:
 
19
  # --- 工具 3: 科研配图助手 ---
20
  with gr.TabItem("📑 Image Auto Cropper"):
21
  paper_image_tool.create_paper_tool()
22
+
23
+ # --- 工具 4: JSON 编辑器 ---
24
+ with gr.TabItem("⚡ Fastest JSON Editor"):
25
+ json_editor.create_ui()
26
 
27
  # --- 可以在这里继续添加更多 Tab ---
28
 
apps/json_editor.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import copy
import json
import os
import re
import textwrap
import traceback

import gradio as gr
import pandas as pd
import requests
6
+
7
# --- LLM Configuration ---
# The API key is a secret and must not live in source control. It is read from
# the environment instead (DEEPSEEK_API_KEY preferred, LLM_API_KEY as fallback).
# When neither variable is set the key is empty and requests will be rejected
# by the API; call_llm reports that as an "Error: ..." string.
# NOTE(review): a literal key was previously committed on this line; treat it
# as leaked and rotate it with the provider.
LLM_API_KEY = os.environ.get("DEEPSEEK_API_KEY") or os.environ.get("LLM_API_KEY", "")
LLM_API_URL = "https://api.deepseek.com/v1/chat/completions"
10
+
11
def call_llm(prompt, system_prompt="You are a helpful assistant."):
    """Send one system+user chat turn to the configured LLM endpoint.

    Args:
        prompt: Text sent as the user message.
        system_prompt: Text sent as the system message.

    Returns:
        The content of the first choice's message on success. On any failure
        (network error, non-2xx status, unexpected response shape) the
        exception is not raised; a string of the form "Error: <details>" is
        returned instead, so callers must check for that prefix.
    """
    auth_headers = {
        "Authorization": f"Bearer {LLM_API_KEY}",
        "Content-Type": "application/json",
    }
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    payload = {"model": "deepseek-chat", "messages": conversation, "stream": False}

    try:
        reply = requests.post(LLM_API_URL, headers=auth_headers, json=payload, timeout=60)
        reply.raise_for_status()
        body = reply.json()
        return body['choices'][0]['message']['content']
    except Exception as e:
        # Best-effort contract: failures are reported in-band as text.
        return f"Error: {str(e)}"
30
+
31
def _parse_sample_item(text):
    """Parse *text* as one JSON document, falling back to its first JSONL line."""
    try:
        return json.loads(text)
    except (TypeError, ValueError):
        # Not a single JSON document: treat the input as JSONL and sample
        # only the first line.
        return json.loads(text.strip().split('\n')[0])


def analyze_json_structure(json_input):
    """Ask the LLM to describe the fields of a sample JSON item.

    Args:
        json_input: Text holding either one JSON document or JSONL (only the
            first line is used in the JSONL case).

    Returns:
        A ``(analysis, message)`` tuple. ``analysis`` is a list of dicts that
        each contain at least a ``"field"`` key (the prompt also requests
        ``"type"``, ``"sample"`` and ``"suggestions"``); it is ``[]`` on any
        failure. ``message`` is "Analysis Complete" on success or a
        description of what went wrong.
    """
    try:
        data = _parse_sample_item(json_input)
    except (AttributeError, TypeError, ValueError) as e:
        return [], f"Parse Error: {e}"

    prompt = f"""
    Analyze this JSON item from an SFT dataset:
    {json.dumps(data, indent=2)}

    1. Identify all fields, their types, and a short sample value.
    2. For each field, suggest 1-3 common data cleaning/modification actions relevant to SFT (e.g., "Normalize score", "Remove 'User:' prefix", "Fix HTML entities", "Delete if empty").
    3. Return ONLY a JSON list of objects with keys: "field", "type", "sample", "suggestions" (list of strings).
    """

    response = call_llm(prompt, "You are a data engineering expert.")

    # call_llm reports request failures in-band as "Error: ..."; surface that
    # directly instead of misreporting it as malformed LLM JSON.
    if response.startswith("Error: "):
        return [], f"LLM Request Failed: {response}"

    # Strip a Markdown code fence if the model wrapped its answer in one.
    if "```json" in response:
        response = response.split("```json")[1].split("```")[0]
    elif "```" in response:
        response = response.split("```")[1].split("```")[0]

    try:
        analysis = json.loads(response.strip())
    except ValueError as e:
        return [], f"LLM Parse Error: {e}\nRaw: {response}"

    # The UI handlers index each entry by 'field', so reject shapes that
    # would crash them later.
    if not isinstance(analysis, list):
        return [], f"LLM Parse Error: expected a JSON list\nRaw: {response}"
    analysis = [
        entry for entry in analysis
        if isinstance(entry, dict) and "field" in entry
    ]
    if not analysis:
        return [], f"LLM Parse Error: no field descriptions found\nRaw: {response}"

    return analysis, "Analysis Complete"
65
+
66
def generate_transform_code(json_sample, rules):
    """Ask the LLM for a ``transform(item)`` function implementing *rules*.

    Args:
        json_sample: Text of a sample item, embedded verbatim in the prompt.
        rules: JSON-serializable rule list, e.g.
            ``[{'field': 'x', 'action': 'y', 'custom': 'z'}]``.

    Returns:
        The LLM reply with any surrounding Markdown code fence removed and
        outer whitespace stripped. The reply is not validated as Python here.
    """
    prompt = f"""
    I have a JSON item structure like this:
    {json_sample}

    I need a Python function `transform(item)` that modifies this item based on these rules:
    {json.dumps(rules, indent=2)}

    Requirements:
    1. The function must take a dict `item` and return the modified dict.
    2. If the item should be filtered out (dropped), return None.
    3. Handle missing fields gracefully.
    4. Return ONLY the Python code for the function. No markdown.
    """
    reply = call_llm(prompt, "You are a Python expert.")

    # Prefer the language-tagged fence; fall back to a bare fence.
    for opening_fence in ("```python", "```"):
        if opening_fence in reply:
            after_fence = reply.split(opening_fence)[1]
            reply = after_fence.split("```")[0]
            break

    return reply.strip()
88
+
89
def generate_full_script(transform_code):
    """Build a standalone batch-processing script around a transform function.

    The previous template always emitted its own ``def transform(item):``
    header and pasted *transform_code* underneath it. Because the generated
    code is itself a complete ``def transform(item): ...``, the resulting
    script did not compile. The code is now embedded verbatim when it already
    defines ``transform`` and is only wrapped in a header when it is a bare
    function body.

    Args:
        transform_code: Python source that either defines ``transform(item)``
            at top level or is the body of such a function. Empty input
            yields an identity transform.

    Returns:
        The text of a Python script that reads ``--input`` (JSONL, or a JSON
        list when the file starts with ``[``), applies ``transform`` to every
        item, and writes the non-``None`` results to ``--output`` as JSONL.
    """
    code = textwrap.dedent(transform_code or "").strip()

    if re.search(r"^def\s+transform\s*\(", code, re.MULTILINE):
        # Already a full definition (possibly with imports/helpers around it).
        transform_block = code
    else:
        body = code if code else "return item"
        transform_block = "def transform(item):\n" + textwrap.indent(body, "    ")

    # Plain (non-f) string with a placeholder, so braces in the script need
    # no escaping and the inserted code is never re-interpreted.
    template = """import orjson
import tqdm
import argparse
import sys

__TRANSFORM_BLOCK__

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True, help='Input JSON/JSONL file')
    parser.add_argument('--output', required=True, help='Output JSONL file')
    args = parser.parse_args()

    print(f"Processing {args.input} -> {args.output}")

    with open(args.input, 'rb') as f_in, open(args.output, 'wb') as f_out:
        # Detect format roughly
        first_char = f_in.read(1)
        f_in.seek(0)

        is_jsonl = True  # Default assumption

        # Simple line-by-line processing for JSONL.
        # For a standard JSON list we'd need ijson or similar for streaming,
        # but for simplicity assume JSONL or small JSON.

        lines = f_in
        if first_char == b'[':
            print("Warning: Standard JSON list detected. Loading full file (memory intensive).")
            data = orjson.loads(f_in.read())
            lines = data
            is_jsonl = False

        processed_count = 0
        for line in tqdm.tqdm(lines):
            if is_jsonl:
                try:
                    item = orjson.loads(line)
                except ValueError:
                    # Blank or malformed line (orjson.JSONDecodeError is a ValueError).
                    continue
            else:
                item = line

            result = transform(item)

            if result is not None:
                f_out.write(orjson.dumps(result) + b'\\n')
                processed_count += 1

    print(f"Done. Wrote {processed_count} items.")

if __name__ == "__main__":
    main()
"""
    return template.replace("__TRANSFORM_BLOCK__", transform_block)
146
+
147
+ # --- UI Logic ---
148
+
149
def on_analyze(json_text):
    """Handle the "Analyze Structure" button.

    Args:
        json_text: Contents of the sample-JSON textbox.

    Returns:
        A 3-tuple for the wired outputs: the analysis list (stored in state),
        a ``gr.update`` setting the field dropdown's choices, and the status
        message from ``analyze_json_structure``.
    """
    analysis, msg = analyze_json_structure(json_text)

    # The analysis comes from an LLM, so skip entries that are not dicts or
    # have no 'field' key rather than raising inside the UI callback.
    fields = [
        entry['field']
        for entry in (analysis or [])
        if isinstance(entry, dict) and 'field' in entry
    ]

    return analysis, gr.update(choices=fields), msg
156
+
157
def on_field_select(field, analysis_data):
    """Populate the action dropdown for the newly selected field.

    Args:
        field: Name chosen in the field dropdown.
        analysis_data: Analysis list held in state (dicts with 'field' and
            optionally 'suggestions').

    Returns:
        A ``gr.update`` whose choices are the three built-in actions followed
        by the field's LLM suggestions (without duplicates), with the first
        choice selected.
    """
    # Built-ins come first, so "Custom" is always present.
    suggestions = ["Keep Unchanged", "Delete Field", "Custom"]

    for entry in analysis_data or []:
        if not isinstance(entry, dict) or entry.get('field') != field:
            continue
        extra = entry.get('suggestions') or []
        if isinstance(extra, str):
            # A lone string would otherwise be split into single characters.
            extra = [extra]
        elif not isinstance(extra, (list, tuple)):
            extra = []
        for suggestion in extra:
            if isinstance(suggestion, str) and suggestion not in suggestions:
                suggestions.append(suggestion)
        break

    return gr.update(choices=suggestions, value=suggestions[0])
169
+
170
def add_rule(field, action, custom, current_rules):
    """Append a modification rule and return the new rule list plus table rows.

    Args:
        field: Target field name; nothing is added when it is empty/None.
        action: Selected action; nothing is added when it is empty/None.
        custom: Free-text instruction, shown in the rule's display string
            when ``action`` is "Custom".
        current_rules: Existing rule dicts (may be None or empty). This list
            is not modified; a new list is returned.

    Returns:
        ``(rules, display_data)`` where ``rules`` is the updated list of rule
        dicts (keys: field, action, custom, display) and ``display_data`` is
        a list of ``[field, action, custom]`` rows for the rules table.
    """
    # Copy so the caller's state object is never mutated in place.
    rules = list(current_rules or [])

    # A rule without a field or an action is meaningless to the code
    # generator, so ignore the click instead of recording it.
    if field and action:
        rule_desc = f"Custom: {custom}" if action == "Custom" else action
        rules.append({
            "field": field,
            "action": action,
            "custom": custom,
            "display": f"{field} -> {rule_desc}",
        })

    display_data = [[r['field'], r['action'], r['custom']] for r in rules]
    return rules, display_data
184
+
185
def run_preview(json_text, rules):
    """Generate a transform for *rules* and run it on the sample item.

    Args:
        json_text: Sample item text (a JSON document, or JSONL whose first
            line is used).
        rules: List of rule dicts as produced by ``add_rule``.

    Returns:
        Always a dict, so the value can be shown in a JSON component. On
        success: ``{"original", "modified", "code"}``. On failure:
        ``{"error": <message>}``, plus ``"code"`` when code was generated.
    """
    if not rules:
        return {"error": "No rules defined."}

    # 1. Generate code
    transform_code = generate_transform_code(json_text, rules)

    # 2. Execute locally.
    # SECURITY(review): this runs LLM-generated code in-process with full
    # interpreter privileges. The prompt embeds user-supplied text, so this is
    # effectively arbitrary code execution; sandbox it before exposing the app
    # to untrusted users.
    #
    # One shared namespace is used on purpose: with separate globals and
    # locals, top-level imports and helpers in the generated code land in the
    # locals dict, where the body of `transform` cannot see them (NameError).
    namespace = {}
    try:
        exec(transform_code, namespace)
        transform_func = namespace.get('transform')

        if not callable(transform_func):
            return {
                "error": "Error: Could not find 'transform' function in generated code.",
                "code": transform_code,
            }

        # Parse input (single JSON document, else first JSONL line)
        try:
            item = json.loads(json_text)
        except ValueError:
            item = json.loads(json_text.strip().split('\n')[0])

        # transform() is allowed to mutate its argument; keep a pristine copy
        # so "original" really shows the pre-transform item.
        original = copy.deepcopy(item)
        result = transform_func(item)

        return {
            "original": original,
            "modified": result,
            "code": transform_code,
        }
    except Exception as e:
        # Boundary for arbitrary generated code: report instead of crashing.
        return {"error": f"Execution Error: {e}", "code": transform_code}
217
+
218
def create_ui():
    """Build the JSON-editor tab and wire its events.

    Must be called inside an active Gradio Blocks/Tab context; the components
    are created in whatever layout context is current. Returns nothing.
    """
    gr.Markdown("""
    ## ⚡ Fastest JSON Editor (快速 JSON 编辑器)
    Intelligent analysis and modification of JSON/JSONL data using LLM.
    利用 LLM 智能分析和修改 JSON/JSONL 数据,生成高性能处理脚本。
    """)

    with gr.Row():
        with gr.Column(scale=1):
            json_input = gr.Textbox(label="Sample JSON Item", lines=10, placeholder="Paste a single JSON object here...")
            analyze_btn = gr.Button("🔍 Analyze Structure")
            status_msg = gr.Markdown("")

        with gr.Column(scale=1):
            # Field inspector: per-session state for the LLM analysis and the
            # rules the user has added so far.
            analysis_state = gr.State([])
            rules_state = gr.State([])

            with gr.Group():
                gr.Markdown("### 🛠️ Add Modification Rule")
                field_dropdown = gr.Dropdown(label="Select Field", choices=[])
                action_dropdown = gr.Dropdown(label="Action", choices=["Keep Unchanged", "Delete Field", "Custom"], allow_custom_value=True)
                custom_input = gr.Textbox(label="Custom Instruction (if needed)", placeholder="e.g. Convert to YYYY-MM-DD")
                add_btn = gr.Button("Add Rule")

            rules_table = gr.Dataframe(headers=["Field", "Action", "Custom"], label="Active Rules", interactive=False)

    with gr.Row():
        preview_btn = gr.Button("▶️ Preview & Generate Code", variant="primary")

    with gr.Row():
        with gr.Column():
            preview_json = gr.JSON(label="Preview Result (Diff)")
        with gr.Column():
            code_output = gr.Code(label="Generated Transform Function", language="python")

    with gr.Row():
        gen_script_btn = gr.Button("🚀 Generate Full Script")

    full_script_output = gr.Code(label="Full Production Script", language="python", visible=False)

    def update_code_view(result):
        """Extract the generated code from a preview result dict."""
        if isinstance(result, dict):
            return result.get('code', '')
        return ""

    def on_gen_script(code):
        """Reveal the full-script panel filled with the generated script."""
        return gr.update(visible=True, value=generate_full_script(code))

    # Event wiring
    analyze_btn.click(on_analyze, inputs=[json_input], outputs=[analysis_state, field_dropdown, status_msg])

    field_dropdown.change(on_field_select, inputs=[field_dropdown, analysis_state], outputs=[action_dropdown])

    add_btn.click(add_rule,
                  inputs=[field_dropdown, action_dropdown, custom_input, rules_state],
                  outputs=[rules_state, rules_table])

    # The code view is derived from the preview result, so it must run AFTER
    # run_preview has updated preview_json. Two independent click handlers
    # would read the previous value of preview_json; chain with .then().
    preview_btn.click(run_preview,
                      inputs=[json_input, rules_state],
                      outputs=[preview_json]
                      ).then(update_code_view, inputs=[preview_json], outputs=[code_output])

    gen_script_btn.click(on_gen_script, inputs=[code_output], outputs=[full_script_output])