internationalscholarsprogram committed on
Commit
c9990de
·
1 Parent(s): 928c304

Add DB compare and update logic

Browse files
Files changed (2) hide show
  1. app.py +198 -20
  2. requirements.txt +1 -1
app.py CHANGED
@@ -1,36 +1,214 @@
1
- import gradio as gr
2
  import json
 
 
3
  from docx import Document # from python-docx
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
def docx_to_json(file_obj):
    """
    Convert an uploaded .docx file into a JSON string.

    Args:
        file_obj: Uploaded file object whose ``.name`` attribute is the path
            of the document on disk, or ``None`` when nothing was uploaded.

    Returns:
        A JSON string. For a document it has the keys "paragraphs" (text of
        every non-blank paragraph, in document order) and "paragraph_count".
        When no file was uploaded it contains a single "error" key.
    """
    if file_obj is None:
        # Always return one string: the callback feeds a single output
        # component, so a tuple here would not match the wiring.
        return json.dumps({"error": "No file uploaded"}, indent=2)

    document = Document(file_obj.name)

    # Keep only paragraphs that contain something other than whitespace.
    paragraphs = [p.text for p in document.paragraphs if p.text.strip()]

    data = {
        "paragraphs": paragraphs,
        "paragraph_count": len(paragraphs),
    }
    return json.dumps(data, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
# Prototype UI: upload a Word document and show its JSON representation.
with gr.Blocks() as demo:
    gr.Markdown("# DOCX → JSON & DB Sync (Prototype)")

    # Upload control for the Word document.
    with gr.Row():
        file_input = gr.File(label="Upload Word (.docx)")

    # Displays the string returned by docx_to_json.
    json_output = gr.Code(label="Extracted JSON", language="json")

    # Clicking the button runs the conversion on the uploaded file.
    btn = gr.Button("Convert to JSON")
    btn.click(
        fn=docx_to_json,
        inputs=file_input,
        outputs=json_output,
    )


# Start the app only when executed as a script.
if __name__ == "__main__":
    demo.launch()
 
1
+ import os
2
  import json
3
+
4
+ import gradio as gr
5
  from docx import Document # from python-docx
6
+ from deepdiff import DeepDiff
7
+ import mysql.connector
8
+
9
+
10
+ # -----------------------------
11
+ # DB CONNECTION HELPERS
12
+ # -----------------------------
13
def get_db_connection():
    """
    Open a new MySQL connection configured from environment variables.

    Reads DB_HOST, DB_PORT, DB_USER, DB_PASSWORD and DB_NAME (set these in
    your HF Space settings). Variables that are not set fall back to
    "localhost", "3306", "root", "" and "test" respectively.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
    """
    env = os.getenv
    settings = {
        "host": env("DB_HOST", "localhost"),
        # Environment values are strings; the port is converted to an int.
        "port": int(env("DB_PORT", "3306")),
        "user": env("DB_USER", "root"),
        "password": env("DB_PASSWORD", ""),
        "database": env("DB_NAME", "test"),
    }
    return mysql.connector.connect(**settings)
26
+
27
+
28
def fetch_db_json(doc_id: str):
    """
    Fetch the stored JSON for ``doc_id`` from the database.

    Assumes a table 'documents' with columns: id, json_data.

    Args:
        doc_id: Value matched against ``documents.id``.

    Returns:
        The decoded JSON value, or ``None`` when no row matches or the
        stored ``json_data`` is NULL.

    Raises:
        json.JSONDecodeError: If the stored text is not valid JSON.
        Exception: Any error raised by the database driver is propagated.
    """
    conn = get_db_connection()
    # Bound before the try block so the cleanup below never touches an
    # unbound name (and masks the real error) if conn.cursor() itself fails.
    cursor = None
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT json_data FROM documents WHERE id = %s", (doc_id,))
        row = cursor.fetchone()
        if not row or row[0] is None:
            return None

        stored = row[0]
        # Text/bytes payloads are parsed as JSON; any other value is handed
        # back unchanged instead of failing inside json.loads.
        if isinstance(stored, (str, bytes, bytearray)):
            return json.loads(stored)
        return stored
    finally:
        # Close the connection even if closing the cursor raises.
        try:
            if cursor is not None:
                cursor.close()
        finally:
            conn.close()
46
+
47
+
48
def update_db_json(doc_id: str, new_data: dict) -> None:
    """
    Overwrite the stored JSON for ``doc_id`` with ``new_data``.

    Assumes a table 'documents' with columns: id, json_data.

    Args:
        doc_id: Value matched against ``documents.id``.
        new_data: JSON-serialisable object; it is stored as JSON text.

    Raises:
        TypeError: If ``new_data`` cannot be serialised to JSON.
        Exception: Any error raised by the database driver is propagated.
    """
    # Serialise first so unserialisable data never opens a connection.
    payload = json.dumps(new_data, ensure_ascii=False)

    conn = get_db_connection()
    # Bound before the try block so the cleanup below never touches an
    # unbound name (and masks the real error) if conn.cursor() itself fails.
    cursor = None
    try:
        cursor = conn.cursor()
        # NOTE(review): this is a plain UPDATE, so when no row has this id
        # nothing is written and no error is raised.
        cursor.execute(
            "UPDATE documents SET json_data = %s WHERE id = %s",
            (payload, doc_id),
        )
        conn.commit()
    finally:
        # Close the connection even if closing the cursor raises.
        try:
            if cursor is not None:
                cursor.close()
        finally:
            conn.close()
62
+
63
 
64
+ # -----------------------------
65
+ # DOCX → JSON
66
+ # -----------------------------
67
def docx_to_python_dict(file_obj):
    """
    Convert an uploaded .docx file into a plain dictionary.

    Args:
        file_obj: The uploaded file: either an object exposing the file
            path as ``.name`` or the path itself.

    Returns:
        dict with "paragraphs" (text of every non-blank paragraph, in
        document order) and "paragraph_count" (length of that list).

    Raises:
        ValueError: If ``file_obj`` is None.
    """
    if file_obj is None:
        raise ValueError("No file uploaded")

    # Accept both a file wrapper carrying `.name` and a bare path value.
    path = getattr(file_obj, "name", file_obj)
    document = Document(path)

    # Example: convert paragraphs into a simple structured dict,
    # skipping paragraphs that contain only whitespace.
    paragraphs = [p.text for p in document.paragraphs if p.text.strip()]

    return {
        "paragraphs": paragraphs,
        "paragraph_count": len(paragraphs),
    }
81
+
82
+
83
+ # -----------------------------
84
+ # GRADIO CALLBACKS
85
+ # -----------------------------
86
def convert_and_compare(file_obj, doc_id):
    """
    Convert the uploaded DOCX, load the stored JSON and diff the two.

    Steps:
        1. Convert DOCX to JSON (Python dict).
        2. Fetch old JSON from DB.
        3. Compare old and new with DeepDiff.

    Args:
        file_obj: Uploaded file, or ``None`` when nothing was uploaded.
        doc_id: Document ID to look up; surrounding whitespace is ignored.

    Returns:
        A 3-tuple of strings ``(new_json_str, old_json_str, diff_str)``.
        Failures are reported through these strings rather than raised, so
        the three output widgets always receive a value.
    """
    if file_obj is None:
        return "{}", "No existing record (or doc_id missing)", "No file uploaded."

    # A whitespace-only ID is treated as missing rather than sent to the DB.
    if isinstance(doc_id, str):
        doc_id = doc_id.strip()
    if not doc_id:
        return "{}", "{}", "Please provide a doc_id to look up in the database."

    # 1) DOCX → dict
    try:
        new_data = docx_to_python_dict(file_obj)
    except Exception as e:  # UI boundary: show the failure to the user.
        return "{}", "{}", f"Error parsing DOCX: {e}"

    new_json_str = json.dumps(new_data, indent=2, ensure_ascii=False)

    # 2) Fetch existing from DB
    try:
        old_data = fetch_db_json(doc_id)
    except Exception as e:  # UI boundary: show the failure to the user.
        return new_json_str, "{}", f"Error fetching from DB: {e}"

    if old_data is None:
        old_json_str = "No existing JSON found for this doc_id."
        diff_str = "No existing data to compare. You can choose to update DB with this new JSON."
        return new_json_str, old_json_str, diff_str

    old_json_str = json.dumps(old_data, indent=2, ensure_ascii=False)

    # 3) Compare with DeepDiff
    try:
        diff = DeepDiff(old_data, new_data, ignore_order=True)
        if not diff:
            diff_str = "No differences detected between DOCX JSON and DB JSON."
        else:
            # default=str stringifies any diff value json cannot encode natively.
            diff_str = json.dumps(diff, indent=2, ensure_ascii=False, default=str)
    except Exception as e:  # UI boundary: show the failure to the user.
        diff_str = f"Error computing diff: {e}"

    return new_json_str, old_json_str, diff_str
133
+
134
+
135
def apply_update(doc_id, new_json_str):
    """
    Apply the new JSON to the DB if user confirms.

    Args:
        doc_id: Document ID to update; surrounding whitespace is ignored.
        new_json_str: JSON text to store. ``None``, empty or whitespace-only
            text is rejected.

    Returns:
        A status message describing success or the reason for failure.
        Errors are reported in the message rather than raised.
    """
    # A whitespace-only ID is treated as missing rather than sent to the DB.
    if isinstance(doc_id, str):
        doc_id = doc_id.strip()
    if not doc_id:
        return "Please provide a doc_id."

    # An empty editor may deliver None instead of "", so test for that
    # before calling .strip().
    if not new_json_str or not new_json_str.strip():
        return "No new JSON provided to update."

    try:
        new_data = json.loads(new_json_str)
    except (TypeError, ValueError) as e:
        return f"Error parsing new JSON: {e}"

    try:
        update_db_json(doc_id, new_data)
    except Exception as e:  # UI boundary: show the failure to the user.
        return f"Error updating DB: {e}"

    return "Database updated successfully with new JSON."
156
+
157
+
158
+ # -----------------------------
159
+ # GRADIO UI
160
+ # -----------------------------
161
# Page layout: inputs, side-by-side JSON panes, the diff, then the update step.
with gr.Blocks() as demo:
    gr.Markdown("# DOCX → JSON DB Sync")
    gr.Markdown(
        "Upload a Word (.docx) file, enter the document ID from your database, "
        "and compare the generated JSON with what is stored in the DB. "
        "If there are changes, you can update the DB."
    )

    # Inputs: the document to convert and the DB record to compare against.
    with gr.Row():
        file_input = gr.File(label="Upload .docx file")
        doc_id_input = gr.Textbox(
            label="Document ID (as stored in DB)",
            placeholder="e.g. 123",
        )

    # New JSON is editable; its current content is what the update step sends.
    with gr.Row():
        new_json_output = gr.Code(
            label="New JSON (from DOCX)", language="json", interactive=True
        )
        old_json_output = gr.Code(
            label="Existing JSON (from DB)", language="json", interactive=False
        )

    diff_output = gr.Code(
        label="Diff (DeepDiff result)", language="json", interactive=False
    )

    # Convert the upload, fetch the stored JSON and fill the three panes.
    compare_button = gr.Button("Convert & Compare")
    compare_button.click(
        fn=convert_and_compare,
        inputs=[file_input, doc_id_input],
        outputs=[new_json_output, old_json_output, diff_output],
    )

    gr.Markdown("## Apply Update")
    gr.Markdown(
        "If you're happy with the changes, click below to write the **New JSON** "
        "back into the database for this `doc_id`."
    )

    update_status = gr.Textbox(label="Update Status", interactive=False)

    # Write the content of the New JSON pane back for the given document ID.
    update_button = gr.Button("Update DB with New JSON")
    update_button.click(
        fn=apply_update,
        inputs=[doc_id_input, new_json_output],
        outputs=[update_status],
    )


# Start the app only when executed as a script.
if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
  gradio
2
  python-docx
3
  deepdiff
4
- mysql-connector-python # or psycopg2 / sqlalchemy depending on your DB
 
1
  gradio
2
  python-docx
3
  deepdiff
4
+ mysql-connector-python