Guiyom commited on
Commit
db9a2da
·
verified ·
1 Parent(s): f31625d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -201
app.py CHANGED
@@ -158,229 +158,147 @@ Chunk to be replaced:
158
  unique_snippet = find_best_matching_snippet(chunk_html, report_html)
159
  return unique_snippet
160
 
161
def fine_tune_report(adjustmentguidelines: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
                     initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
                     complementary_guidance: str) -> (str, str):
    """
    Iteratively improve an HTML report, chunk by chunk, using LLM calls.

    Flow:
      1. If a specific adjustment instruction is given, ask the LLM to extract
         the exact report snippet(s) it applies to.
      2. Otherwise fall back to a global analysis: designated
         ``div.improvable-chunk`` elements plus LLM-identified chunks, or, as a
         last resort, paragraph groups wrapped into chunk divs on the fly.
      3. Each chunk is rewritten by the LLM and substituted back into the
         report via a minimal unique snippet match.
      4. The reference table is regenerated if new inline citations appeared.

    Parameters:
        adjustmentguidelines: Optional user instruction targeting specific content.
        openai_api_key: OpenAI API key, exported to the environment.
        serpapi_api_key: SerpAPI key, exported to the environment.
        report_html: Full HTML of the current report.
        initial_request: The original research query.
        qa: Existing clarification Q&A log.
        target_style: Desired style of the report.
        knowledge_crumbs: Aggregated search/source content.
        complementary_guidance: Any additional guidance.

    Returns:
        (updated_report_html, updated_qa) — the reworked report plus the QA log
        with the improvement summary appended.
    """
    import json
    import logging
    import os
    from bs4 import BeautifulSoup

    def _strip_fences(text: str) -> str:
        # LLM answers are frequently wrapped in markdown fences (``` or
        # ```json). A bare str.strip("`") leaves the language tag behind and
        # breaks json.loads, so peel the fences explicitly.
        text = text.strip()
        if text.startswith("```"):
            text = text[3:]
            lowered = text.lower()
            if lowered.startswith("json") or lowered.startswith("html"):
                text = text[4:]
            text = text.lstrip("\n")
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    # Set API keys in environment variables.
    os.environ["OPENAI_API_KEY"] = openai_api_key
    os.environ["SERPAPI_API_KEY"] = serpapi_api_key

    # Parse the original report HTML.
    soup = BeautifulSoup(report_html, "html.parser")
    updated_report_html = report_html  # working copy

    # --- Specific adjustment extraction ---
    if adjustmentguidelines.strip():
        extraction_prompt = f"""You are a technical editor. Review the following report HTML and, based on the specific user instruction below,
extract only the precise HTML snippet(s) (including any meaningful surrounding context) that must be improved.

User Instruction: "{adjustmentguidelines}"

Report HTML:
{report_html}

Provide a JSON object with a single key "identified_snippets" mapping to an array of HTML snippets that require adjustment.
Do not include any additional commentary or markdown formatting.
"""
        extraction_result = openai_call(prompt=extraction_prompt, model="o3-mini", max_tokens_param=1500, temperature=0.5)
        try:
            extraction_result = _strip_fences(extraction_result)
            extraction_json = json.loads(extraction_result)
            identified_snippets = extraction_json.get("identified_snippets", [])
        except Exception as e:
            logging.error(f"Error extracting snippets: {e}. Raw result: {extraction_result}")
            identified_snippets = []
        if identified_snippets:
            # Widen each snippet so the later unique-snippet replacement has
            # enough surrounding context to anchor on.
            expanded_snippets = [expand_snippet_area(report_html, snippet) for snippet in identified_snippets]
            all_chunks = expanded_snippets
            all_guidelines = [adjustmentguidelines.strip() for _ in range(len(expanded_snippets))]
            all_token_sizes = [1000] * len(expanded_snippets)
        else:
            logging.info("No specific snippets extracted with the adjustment instruction. Falling back to default global analysis.")
            all_chunks = []
            all_guidelines = []
            all_token_sizes = []
    else:
        all_chunks = []
        all_guidelines = []
        all_token_sizes = []

    # --- Fallback global analysis if no specific snippets were extracted ---
    if not all_chunks:
        designated_chunks = soup.find_all("div", class_="improvable-chunk")
        global_chunk_prompt = f"""Review the entire report HTML provided below and identify specific sections that should be improved for clarity, consistency, and overall readability.
The identified chunks should be distributed across the document in order to enhance alignment with the initial request and complementary guidance.

Please provide a JSON object with exactly three keys (without additional commentary):

"identified_chunks": An array of HTML snippets representing the chunks to be adjusted.
"chunk_adjustment_guidelines": A list of guideline strings (each with bullet points) for each chunk.
"chunk_token_sizes": A list of integers indicating the recommended token size for processing each corresponding chunk.

Report HTML:
{report_html}

Initial Request:
{initial_request}

Complementary Guidance:
{complementary_guidance}

Clarification Q&A:
{qa}

Target Style:
{target_style}

Knowledge Crumbs (search results):
{knowledge_crumbs}
"""
        # Pre-bind so the except handler can log it even when openai_call
        # itself raises (previously that produced a NameError inside the
        # handler, masking the real error).
        global_result = ""
        try:
            global_result = openai_call(prompt=global_chunk_prompt, model="o3-mini", max_tokens_param=4000, temperature=0.5)
            global_result = _strip_fences(global_result)
            global_json = json.loads(global_result)
            identified_chunks_from_llm = global_json.get("identified_chunks", [])
            chunk_adjustment_guidelines_from_llm = global_json.get("chunk_adjustment_guidelines", [])
            chunk_token_sizes_from_llm = global_json.get("chunk_token_sizes", [])
        except Exception as e:
            logging.error(f"Error during global analysis: {e}. Raw result: {global_result}")
            identified_chunks_from_llm = []
            chunk_adjustment_guidelines_from_llm = []
            chunk_token_sizes_from_llm = []
        designated_chunks_html = []
        designated_guidelines = []
        designated_token_sizes = []
        if designated_chunks:
            for chunk in designated_chunks:
                chunk_html = str(chunk)
                designated_prompt = f"""Given the following report chunk:
{chunk_html}

Generate a JSON object with exactly two keys (no extra commentary):
"guideline": A string with bullet-point guidelines on how to adjust this chunk, ensuring modifications align with the research query and that citations are updated ([x]).
"token_size": An integer representing the recommended token size for processing this chunk.
"""
                # Pre-bind for the same NameError-in-handler reason as above.
                result = ""
                try:
                    result = openai_call(prompt=designated_prompt, model="o3-mini", max_tokens_param=500, temperature=0.5)
                    result = _strip_fences(result)
                    result_json = json.loads(result)
                    designated_guidelines.append(result_json.get("guideline", ""))
                    designated_token_sizes.append(result_json.get("token_size", 1000))
                    designated_chunks_html.append(chunk_html)
                except Exception as e:
                    logging.error(f"Error processing a designated chunk: {e}. Raw result: {result}")
                    # Keep the chunk with defaults so lists stay aligned.
                    designated_guidelines.append("")
                    designated_token_sizes.append(1000)
                    designated_chunks_html.append(chunk_html)
        # Reset all_chunks, guidelines and token sizes
        all_chunks = []
        all_guidelines = []
        all_token_sizes = []
        if designated_chunks_html:
            all_chunks.extend(designated_chunks_html)
            all_guidelines.extend(designated_guidelines)
            all_token_sizes.extend(designated_token_sizes)
        if identified_chunks_from_llm and isinstance(identified_chunks_from_llm, list):
            all_chunks.extend(identified_chunks_from_llm)
            all_guidelines.extend(chunk_adjustment_guidelines_from_llm)
            all_token_sizes.extend(chunk_token_sizes_from_llm)
        if not all_chunks:
            # Last resort: wrap paragraph groups into improvable-chunk divs so
            # there is always something to process.
            all_paragraphs = soup.find_all("p")
            group_size = max(1, len(all_paragraphs) // 10)
            for i in range(0, len(all_paragraphs), group_size):
                new_div = soup.new_tag("div", **{"class": "improvable-chunk"})
                for p in all_paragraphs[i:i+group_size]:
                    new_div.append(p.extract())
                if soup.body:
                    soup.body.append(new_div)
                else:
                    soup.append(new_div)
                all_chunks.append(str(new_div))
                all_guidelines.append("Improve clarity and conciseness; ensure consistency regarding citations ([x]).")
                all_token_sizes.append(1000)

    improvements_summary = []

    # --- Process each chunk with robust DOM-based replacement ---
    for idx, (chunk_html, guideline, token_size) in enumerate(zip(all_chunks, all_guidelines, all_token_sizes), start=1):
        chunk_prompt = f"""Improve the following report chunk based on these guidelines:
{guideline}
Use a maximum of {token_size} tokens to generate the improved content.
IMPORTANT: Only modify parts that require improvement. If no changes are necessary, return the original content unchanged.
Additionally, ensure that the improved content includes concrete real-world examples—such as persons with names and titles, company names, institution names, research report titles, quotes, products, and use-case examples—complete with proper inline citations ([x]) as sourced.

--- Chunk #{idx} Original Content ---
{chunk_html}

Initial Request: {initial_request}
Clarification Q&A: {qa}
Target Style: {target_style}
Knowledge Crumbs: {knowledge_crumbs}
Complementary Guidance: {complementary_guidance}
Full Report: {report_html}

Please output a JSON object with exactly two keys (no extra commentary):
{{"improved": "<the improved chunk in valid HTML>", "summary": "<a brief summary of changes>"}}
"""
        # Pre-bind for the except handler's log message.
        chunk_result = ""
        try:
            chunk_result = openai_call(prompt=chunk_prompt, model="o3-mini", max_tokens_param=token_size+1500)
            chunk_result = _strip_fences(chunk_result)
            chunk_json = json.loads(chunk_result)
            improved_chunk = chunk_json.get("improved")
            chunk_summary = chunk_json.get("summary")
            if improved_chunk and chunk_summary:
                improvements_summary.append(f"Chunk {idx}: {chunk_summary}")
                # Determine a minimal unique snippet for the current chunk.
                unique_snippet = get_unique_snippet(chunk_html, report_html)
                improved_chunk_clean = improved_chunk.strip()
                if unique_snippet and unique_snippet in updated_report_html:
                    updated_report_html = updated_report_html.replace(unique_snippet, improved_chunk_clean, 1)
                else:
                    logging.warning(f"Chunk {idx}: Unable to locate the unique snippet ({unique_snippet}). Replacement not applied.")
            else:
                logging.error(f"Chunk {idx}: Incomplete JSON result: {chunk_result}")
        except Exception as e:
            logging.error(f"Error processing chunk {idx}: {e}. Raw result: {chunk_result}")

    # --- Post-process the report to update the reference table without appending extra content ---
    references_prompt = f"""Review the following report HTML.
If any new inline citations (formatted as [x] where x is a number) have been introduced
that are not yet included in the references table at the end of the report,
generate an updated reference summary table that includes all citations.
Each reference entry must include: reference number, name, author(s), and full URL.
Output only the HTML code for the updated reference table.
Report HTML:
{updated_report_html}
"""
    updated_references = openai_call(prompt=references_prompt, model="o3-mini", max_tokens_param=1000, temperature=0.5)
    updated_references = _strip_fences(updated_references)

    if updated_references:
        soup_updated = BeautifulSoup(updated_report_html, "html.parser")
        ref_heading = soup_updated.find(lambda tag: tag.name == "h1" and "Reference Summary Table" in tag.get_text())
        if ref_heading:
            next_sibling = ref_heading.find_next_sibling()
            if next_sibling:
                new_ref_html = BeautifulSoup(updated_references, "html.parser")
                next_sibling.replace_with(new_ref_html)
            updated_report_html = str(soup_updated)
        else:
            logging.info("No existing reference table found; reference update skipped.")
    else:
        logging.info("Generated updated references empty; leaving original references unchanged.")

    global_summary = "Combined Chunk Improvement Guidelines:\n" + "\n".join(all_guidelines)
    summary_text = "Summary of Fine-Tuning Improvements:\n" + "\n".join(improvements_summary)
    updated_qa = qa.strip() + "\n----------\n" + global_summary + "\n" + summary_text

    return updated_report_html, updated_qa
385
 
386
  def generate_graph_snippet(placeholder_text: str, context: str, initial_query: str, crumbs: str) -> str:
 
158
  unique_snippet = find_best_matching_snippet(chunk_html, report_html)
159
  return unique_snippet
160
 
161
def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
                     initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
                     complementary_guidance: str) -> (str, str):
    """
    Fine-tune an HTML report based on a user's correction request.

    Steps:
      1. Ask the LLM to identify the minimal, unique HTML snippet(s) of the
         report that the adjustment request targets.
      2. For each identified snippet that occurs verbatim in the report, ask
         the LLM for a corrected version (given the full report context,
         target style, knowledge crumbs and complementary guidance).
      3. Substitute each corrected snippet back into the report via exact
         string replacement (first occurrence only).
      4. Ask the LLM to regenerate the Reference Summary Table if new inline
         citations were introduced, and splice it in with BeautifulSoup.
      5. Return the updated report and append a summary of applied
         corrections to the QA log.

    Parameters:
        adjustment_request: The user request for corrections (e.g. "fix the visual after 'xyz'").
        openai_api_key: OpenAI API Key.
        serpapi_api_key: SERPAPI API Key.
        report_html: The full HTML of the current report.
        initial_request: The original research query/original request.
        qa: Existing clarification Q&A.
        target_style: The target style for the report.
        knowledge_crumbs: Aggregated source/crumb content.
        complementary_guidance: Any additional guidance.

    Returns:
        A tuple (updated_report_html, updated_qa). The inputs are returned
        unchanged when no snippet could be identified.
    """
    import os
    import json
    import logging
    from bs4 import BeautifulSoup

    def _strip_fences(text: str) -> str:
        # LLM answers are frequently wrapped in markdown fences (``` or
        # ```json). A bare str.strip("`") leaves the language tag behind and
        # breaks json.loads, so peel the fences explicitly.
        text = text.strip()
        if text.startswith("```"):
            text = text[3:]
            lowered = text.lower()
            if lowered.startswith("json") or lowered.startswith("html"):
                text = text[4:]
            text = text.lstrip("\n")
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    # Set API keys in environment variables
    os.environ["OPENAI_API_KEY"] = openai_api_key
    os.environ["SERPAPI_API_KEY"] = serpapi_api_key

    logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")

    # Step 1: Identify the snippet(s) in the report relevant to the adjustment.
    prompt_identify = (f"You are a meticulous technical editor. Below is the full report HTML and a user adjustment request. "
                       f"Based on the user instruction, extract and output the minimal, unique HTML snippet(s) (including their container tags) "
                       f"from the report that need fixing. Output your answer as a JSON object with a key \"identified_snippets\" mapping to a list of HTML snippets only (no commentary).\n\n"
                       f"Full Report HTML:\n{report_html}\n\n"
                       f"User Adjustment Request:\n{adjustment_request}\n\n"
                       f"Only output valid JSON.")

    response_identify = openai_call(prompt=prompt_identify, model="o3-mini", max_tokens_param=1500, temperature=0)
    logging.info(f"fine_tune_report: Raw snippet identification response: {response_identify}")
    try:
        response_identify = _strip_fences(response_identify)
        id_data = json.loads(response_identify)
        identified_snippets = id_data.get("identified_snippets", [])
    except Exception as e:
        logging.error(f"fine_tune_report: Error parsing identified snippets JSON: {e}")
        identified_snippets = []

    # If no snippets were identified, leave the report untouched.
    if not identified_snippets:
        logging.warning("fine_tune_report: No specific snippets were identified for adjustment. Returning original report.")
        return report_html, qa

    # Step 2: For each identified snippet, verify it and prepare to correct it.
    updated_report_html = report_html
    corrections_summary = []

    for snippet in identified_snippets:
        snippet = snippet.strip()
        # Replacement below is a literal string substitution, so skip snippets
        # the LLM paraphrased or hallucinated — they would never match.
        if snippet not in updated_report_html:
            logging.warning(f"fine_tune_report: The following snippet was not found exactly in the report and will be skipped:\n{snippet}")
            continue

        # Step 3: For each snippet, prompt the LLM to apply the user-specified correction.
        prompt_adjust = (f"You are a technical editor. Given the following HTML snippet extracted from a larger report and the user request, "
                         f"make only the changes necessary to address the instruction. Preserve all existing citations, formatting, and context. "
                         f"Ensure that the overall style of the report remains consistent with the provided target style and that any new references (if any) "
                         f"are clearly indicated. Output your answer as a JSON object with two keys: \"improved\" (the corrected HTML snippet) and \"summary\" "
                         f"(a brief summary of the changes applied).\n\n"
                         f"Overall Report HTML:\n{report_html}\n\n"
                         f"Current Snippet to Adjust:\n{snippet}\n\n"
                         f"User Adjustment Request:\n{adjustment_request}\n\n"
                         f"Additional Guidance:\nTarget Style: {target_style}\nKnowledge Crumbs: {knowledge_crumbs}\nComplementary Guidance: {complementary_guidance}\n\n"
                         f"Only output valid JSON.")
        response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
        logging.info(f"fine_tune_report: Raw adjustment response: {response_adjust}")
        try:
            response_adjust = _strip_fences(response_adjust)
            adjust_data = json.loads(response_adjust)
            corrected_snippet = adjust_data.get("improved", "").strip()
            snippet_summary = adjust_data.get("summary", "").strip()
        except Exception as e:
            logging.error(f"fine_tune_report: Error parsing snippet adjustment JSON: {e}")
            continue

        if not corrected_snippet:
            logging.warning("fine_tune_report: No improved snippet was returned by the LLM; skipping this snippet.")
            continue

        corrections_summary.append(f"Changes applied to snippet: {snippet_summary}")
        # Step 4: Replace the original snippet with the improved snippet in the report HTML.
        updated_report_html = updated_report_html.replace(snippet, corrected_snippet, 1)
        logging.info("fine_tune_report: Snippet replaced in the report.")

    # Step 5: Update the reference table. Ask the LLM to review the updated report and generate an updated reference table if needed.
    prompt_refs = (f"You are a technical editor. Review the following updated report HTML. "
                   f"If there are any new inline citations (formatted as [x]) that are not in the existing reference table, "
                   f"generate an updated Reference Summary Table in valid HTML that includes all references. "
                   f"Output only the HTML code for the updated reference table without any extra commentary.\n\n"
                   f"Updated Report HTML:\n{updated_report_html}")
    updated_refs = openai_call(prompt=prompt_refs, model="o3-mini", max_tokens_param=1000, temperature=0.5)
    updated_refs = _strip_fences(updated_refs)

    if updated_refs:
        soup_updated = BeautifulSoup(updated_report_html, "html.parser")
        # Look for a heading that includes "Reference Summary Table"
        ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "Reference Summary Table" in tag.get_text())
        if ref_heading:
            next_sibling = ref_heading.find_next_sibling()
            if next_sibling:
                try:
                    new_ref_html = BeautifulSoup(updated_refs, "html.parser")
                    next_sibling.replace_with(new_ref_html)
                    logging.info("fine_tune_report: Reference table updated successfully.")
                except Exception as e:
                    logging.error(f"fine_tune_report: Error replacing the reference table: {e}")
            else:
                logging.info("fine_tune_report: No sibling element found after the reference heading; skipping reference table update.")
            updated_report_html = str(soup_updated)
        else:
            logging.info("fine_tune_report: No existing reference table heading found; reference update skipped.")
    else:
        logging.info("fine_tune_report: LLM did not return an updated reference table; leaving original references intact.")

    # Step 6: Append corrections summary to the Q&A log.
    global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
    updated_qa = qa.strip() + "\n----------\n" + global_summary

    logging.info("fine_tune_report: Fine-tuning process completed.")
    return updated_report_html, updated_qa
303
 
304
  def generate_graph_snippet(placeholder_text: str, context: str, initial_query: str, crumbs: str) -> str: