Guiyom committed on
Commit
3bddbef
·
verified ·
1 Parent(s): c264689

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -24
app.py CHANGED
@@ -86,6 +86,28 @@ def send_chat_message(user_message, chat_history, report_text, crumbs_text):
86
  updated_history = chat_history + [[user_message, answer]]
87
  return updated_history, ""
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  def fine_tune_report(adjustmentguidelines: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
90
  initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
91
  complementary_guidance: str) -> (str, str):
@@ -93,12 +115,16 @@ def fine_tune_report(adjustmentguidelines: str, openai_api_key: str, serpapi_api
93
  os.environ["OPENAI_API_KEY"] = openai_api_key
94
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
95
 
96
- # Parse the existing report HTML.
97
  soup = BeautifulSoup(report_html, "html.parser")
98
 
99
- # --- Specific adjustment: extract only the snippets indicated by the user ---
 
 
 
 
100
  if adjustmentguidelines.strip():
101
- extraction_prompt = f"""You are a technical editor. Review the following report HTML and, based on the specific user instruction below, extract only the precise HTML snippet(s) (including surrounding context if needed) that must be improved.
102
 
103
  User Instruction: "{adjustmentguidelines}"
104
 
@@ -106,7 +132,7 @@ Report HTML:
106
  {report_html}
107
 
108
  Provide a JSON object with a single key "identified_snippets" mapping to an array of HTML snippets that require adjustment.
109
- Do not output any extra commentary or markdown formatting.
110
  """
111
  extraction_result = openai_call(prompt=extraction_prompt, model="o3-mini", max_tokens_param=1500, temperature=0.5)
112
  try:
@@ -117,11 +143,14 @@ Do not output any extra commentary or markdown formatting.
117
  logging.error(f"Error extracting snippets: {e}. Raw result: {extraction_result}")
118
  identified_snippets = []
119
  if identified_snippets:
120
- all_chunks = identified_snippets
121
- # Use the exact provided user instruction as the guideline for every extracted snippet.
122
- all_guidelines = [adjustmentguidelines.strip() for _ in range(len(identified_snippets))]
123
- # Use a default token size (or adjust as needed).
124
- all_token_sizes = [1000] * len(identified_snippets)
 
 
 
125
  else:
126
  logging.info("No specific snippets extracted with the adjustment instruction. Falling back to default global analysis.")
127
  all_chunks = []
@@ -132,15 +161,16 @@ Do not output any extra commentary or markdown formatting.
132
  all_guidelines = []
133
  all_token_sizes = []
134
 
135
- # --- Fallback logic if no specific snippets provided or extracted ---
136
  if not all_chunks:
137
  designated_chunks = soup.find_all("div", class_="improvable-chunk")
138
- global_chunk_prompt = f"""Review the entire report HTML provided below and identify specific sections that should be improved for clarity, consistency, and overall readability at a chunk level. The identified chunks should be distributed across the document in order to enhance alignment with the initial request and the insights from the complementary guidance.
 
139
 
140
- Please provide three pieces of information in a JSON object with exactly three fields (no extra commentary):
141
 
142
  "identified_chunks": An array of HTML snippets representing the chunks to be adjusted.
143
- "chunk_adjustment_guidelines": A list of guideline strings (each with bullet points) specifying the adjustments for each chunk.
144
  "chunk_token_sizes": A list of integers indicating the recommended token size for processing each corresponding chunk.
145
 
146
  Report HTML:
@@ -174,7 +204,6 @@ Knowledge Crumbs (search results):
174
  chunk_adjustment_guidelines_from_llm = []
175
  chunk_token_sizes_from_llm = []
176
 
177
- # Process designated chunks (if any)
178
  designated_chunks_html = []
179
  designated_guidelines = []
180
  designated_token_sizes = []
@@ -184,8 +213,8 @@ Knowledge Crumbs (search results):
184
  designated_prompt = f"""Given the following report chunk:
185
  {chunk_html}
186
 
187
- Generate a JSON object with exactly two fields (no extra commentary):
188
- "guideline": A string with bullet-point guidelines on how to adjust this chunk, ensuring that modifications align with the research query and that citations are updated as needed ([x]).
189
  "token_size": An integer representing the recommended token size for processing this chunk.
190
  """
191
  try:
@@ -200,7 +229,6 @@ Generate a JSON object with exactly two fields (no extra commentary):
200
  designated_guidelines.append("")
201
  designated_token_sizes.append(1000)
202
  designated_chunks_html.append(chunk_html)
203
- # Merge the global LLM results and any designated chunks.
204
  all_chunks = []
205
  all_guidelines = []
206
  all_token_sizes = []
@@ -212,7 +240,6 @@ Generate a JSON object with exactly two fields (no extra commentary):
212
  all_chunks.extend(identified_chunks_from_llm)
213
  all_guidelines.extend(chunk_adjustment_guidelines_from_llm)
214
  all_token_sizes.extend(chunk_token_sizes_from_llm)
215
- # If still nothing, fall back to grouping paragraphs.
216
  if not all_chunks:
217
  all_paragraphs = soup.find_all("p")
218
  group_size = max(1, len(all_paragraphs) // 10)
@@ -230,7 +257,7 @@ Generate a JSON object with exactly two fields (no extra commentary):
230
 
231
  improvements_summary = [] # To store a plain text summary for each processed chunk
232
 
233
- # --- Process each chunk individually ---
234
  for idx, (chunk_html, guideline, token_size) in enumerate(zip(all_chunks, all_guidelines, all_token_sizes), start=1):
235
  chunk_prompt = f"""Improve the following report chunk based on these guidelines:
236
  {guideline}
@@ -247,7 +274,7 @@ Knowledge Crumbs: {knowledge_crumbs}
247
  Complementary Guidance: {complementary_guidance}
248
  Full Report: {report_html}
249
 
250
- Please output a JSON object with exactly two fields (no extra commentary):
251
  {{"improved": "<the improved chunk in valid HTML>", "summary": "<a brief summary of changes>"}}
252
  """
253
  try:
@@ -258,7 +285,6 @@ Please output a JSON object with exactly two fields (no extra commentary):
258
  chunk_summary = chunk_json.get("summary")
259
  if improved_chunk and chunk_summary:
260
  improvements_summary.append(f"Chunk {idx}: {chunk_summary}")
261
- # Replace the original chunk; find the corresponding content in the soup and replace it.
262
  orig = BeautifulSoup(chunk_html, "html.parser")
263
  new_chunk = BeautifulSoup(improved_chunk, "html.parser")
264
  replacement = soup.find(string=lambda text: text and text.strip() in orig.get_text())
@@ -274,12 +300,9 @@ Please output a JSON object with exactly two fields (no extra commentary):
274
  except Exception as e:
275
  logging.error(f"Error processing chunk {idx}: {e}. Raw result: {chunk_result}")
276
 
277
- # Get the updated report HTML as a string.
278
  final_report_html = str(soup)
279
- # Create a plain text summary combining all improvements.
280
  summary_text = "Summary of Fine-Tuning Improvements:\n" + "\n".join(improvements_summary)
281
  global_summary = "Combined Chunk Improvement Guidelines:\n" + "\n".join(all_guidelines)
282
- # Append both summaries (with a separator) to the original Q&A.
283
  updated_qa = qa.strip() + "\n----------\n" + global_summary + "\n" + summary_text
284
 
285
  return final_report_html, updated_qa
 
86
  updated_history = chat_history + [[user_message, answer]]
87
  return updated_history, ""
88
 
89
def expand_snippet_area(full_html: str, snippet: str) -> str:
    """
    Return the outer HTML of the smallest allowed container enclosing *snippet*.

    Given the full report HTML and a snippet (itself an HTML fragment, as
    produced by the LLM extraction step), locate the deepest element whose
    text contains the snippet's text, then walk upward through the DOM while
    the parent is one of the allowed container tags (div, table, iframe,
    section), stopping before reaching <body>.

    Args:
        full_html: The complete report HTML document.
        snippet: An HTML fragment extracted from the report.

    Returns:
        The outer HTML of the expanded container, or *snippet* unchanged when
        no enclosing element can be located.
    """
    allowed_tags = {"div", "table", "iframe", "section"}
    soup = BeautifulSoup(full_html, "html.parser")

    # The snippet may contain markup, but tag.get_text() is plain text;
    # strip the snippet's own markup first so we compare text against text.
    # (Testing the raw HTML snippet against get_text() would almost never
    # match, silently degrading the function to returning the bare snippet.)
    snippet_text = BeautifulSoup(snippet, "html.parser").get_text().strip()
    if not snippet_text:
        return snippet

    # soup.find(...) would return the FIRST tag in document order whose text
    # contains the snippet -- i.e. the outermost ancestor (often <html>),
    # defeating the purpose of the upward expansion.  Instead collect every
    # matching tag and take the last: pre-order traversal lists ancestors
    # before descendants, so the final match is the deepest element whose
    # text still contains the full snippet text.
    matches = [tag for tag in soup.find_all(True) if snippet_text in tag.get_text()]
    if not matches:
        return snippet

    current = matches[-1]
    # Expand upward through allowed containers only, never crossing <body>.
    while (current.parent is not None
           and current.parent.name
           and current.parent.name.lower() != "body"):
        if current.parent.name.lower() in allowed_tags:
            current = current.parent
        else:
            break
    return str(current)
111
  def fine_tune_report(adjustmentguidelines: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
112
  initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
113
  complementary_guidance: str) -> (str, str):
 
115
  os.environ["OPENAI_API_KEY"] = openai_api_key
116
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
117
 
118
+ # Parse the entire report HTML.
119
  soup = BeautifulSoup(report_html, "html.parser")
120
 
121
+ # --- Specific adjustment extraction ---
122
+ # When an adjustment instruction (such as "rewrite the visual after 6.1") is provided,
123
+ # ask the LLM to locate every occurrence and extract the HTML snippet that needs adjustment.
124
+ # These snippets might be many (or none), and for each snippet, we then expand the selection
125
+ # to include its outer container (like a div, table, or iframe) for full context.
126
  if adjustmentguidelines.strip():
127
+ extraction_prompt = f"""You are a technical editor. Review the following report HTML and, based on the specific user instruction below, extract only the precise HTML snippet(s) (including any meaningful surrounding context) that must be improved.
128
 
129
  User Instruction: "{adjustmentguidelines}"
130
 
 
132
  {report_html}
133
 
134
  Provide a JSON object with a single key "identified_snippets" mapping to an array of HTML snippets that require adjustment.
135
+ Do not include any additional commentary or markdown formatting.
136
  """
137
  extraction_result = openai_call(prompt=extraction_prompt, model="o3-mini", max_tokens_param=1500, temperature=0.5)
138
  try:
 
143
  logging.error(f"Error extracting snippets: {e}. Raw result: {extraction_result}")
144
  identified_snippets = []
145
  if identified_snippets:
146
+ expanded_snippets = []
147
+ # Process EVERY occurrence in the returned array.
148
+ for snippet in identified_snippets:
149
+ expanded = expand_snippet_area(report_html, snippet)
150
+ expanded_snippets.append(expanded)
151
+ all_chunks = expanded_snippets
152
+ all_guidelines = [adjustmentguidelines.strip() for _ in range(len(expanded_snippets))]
153
+ all_token_sizes = [1000] * len(expanded_snippets)
154
  else:
155
  logging.info("No specific snippets extracted with the adjustment instruction. Falling back to default global analysis.")
156
  all_chunks = []
 
161
  all_guidelines = []
162
  all_token_sizes = []
163
 
164
+ # --- Fallback global analysis if no specific snippets were extracted ---
165
  if not all_chunks:
166
  designated_chunks = soup.find_all("div", class_="improvable-chunk")
167
+ global_chunk_prompt = f"""Review the entire report HTML provided below and identify specific sections that should be improved for clarity, consistency, and overall readability.
168
+ The identified chunks should be distributed across the document in order to enhance alignment with the initial request and complementary guidance.
169
 
170
+ Please provide a JSON object with exactly three keys (without additional commentary):
171
 
172
  "identified_chunks": An array of HTML snippets representing the chunks to be adjusted.
173
+ "chunk_adjustment_guidelines": A list of guideline strings (each with bullet points) for each chunk.
174
  "chunk_token_sizes": A list of integers indicating the recommended token size for processing each corresponding chunk.
175
 
176
  Report HTML:
 
204
  chunk_adjustment_guidelines_from_llm = []
205
  chunk_token_sizes_from_llm = []
206
 
 
207
  designated_chunks_html = []
208
  designated_guidelines = []
209
  designated_token_sizes = []
 
213
  designated_prompt = f"""Given the following report chunk:
214
  {chunk_html}
215
 
216
+ Generate a JSON object with exactly two keys (no extra commentary):
217
+ "guideline": A string with bullet-point guidelines on how to adjust this chunk, ensuring modifications align with the research query and that citations are updated ([x]).
218
  "token_size": An integer representing the recommended token size for processing this chunk.
219
  """
220
  try:
 
229
  designated_guidelines.append("")
230
  designated_token_sizes.append(1000)
231
  designated_chunks_html.append(chunk_html)
 
232
  all_chunks = []
233
  all_guidelines = []
234
  all_token_sizes = []
 
240
  all_chunks.extend(identified_chunks_from_llm)
241
  all_guidelines.extend(chunk_adjustment_guidelines_from_llm)
242
  all_token_sizes.extend(chunk_token_sizes_from_llm)
 
243
  if not all_chunks:
244
  all_paragraphs = soup.find_all("p")
245
  group_size = max(1, len(all_paragraphs) // 10)
 
257
 
258
  improvements_summary = [] # To store a plain text summary for each processed chunk
259
 
260
+ # --- Process each chunk ---
261
  for idx, (chunk_html, guideline, token_size) in enumerate(zip(all_chunks, all_guidelines, all_token_sizes), start=1):
262
  chunk_prompt = f"""Improve the following report chunk based on these guidelines:
263
  {guideline}
 
274
  Complementary Guidance: {complementary_guidance}
275
  Full Report: {report_html}
276
 
277
+ Please output a JSON object with exactly two keys (no extra commentary):
278
  {{"improved": "<the improved chunk in valid HTML>", "summary": "<a brief summary of changes>"}}
279
  """
280
  try:
 
285
  chunk_summary = chunk_json.get("summary")
286
  if improved_chunk and chunk_summary:
287
  improvements_summary.append(f"Chunk {idx}: {chunk_summary}")
 
288
  orig = BeautifulSoup(chunk_html, "html.parser")
289
  new_chunk = BeautifulSoup(improved_chunk, "html.parser")
290
  replacement = soup.find(string=lambda text: text and text.strip() in orig.get_text())
 
300
  except Exception as e:
301
  logging.error(f"Error processing chunk {idx}: {e}. Raw result: {chunk_result}")
302
 
 
303
  final_report_html = str(soup)
 
304
  summary_text = "Summary of Fine-Tuning Improvements:\n" + "\n".join(improvements_summary)
305
  global_summary = "Combined Chunk Improvement Guidelines:\n" + "\n".join(all_guidelines)
 
306
  updated_qa = qa.strip() + "\n----------\n" + global_summary + "\n" + summary_text
307
 
308
  return final_report_html, updated_qa