DocUA committed on
Commit
59fabbc
·
1 Parent(s): d814828

Add JSON extraction utility and enhance analysis methods for better error handling

Browse files
Files changed (2) hide show
  1. main.py +107 -132
  2. utils.py +43 -1
main.py CHANGED
@@ -44,7 +44,8 @@ from utils import (
44
  clean_text,
45
  extract_court_decision_text,
46
  get_links_html,
47
- get_links_html_lp
 
48
  )
49
  from embeddings import GeminiEmbedding
50
 
@@ -300,12 +301,16 @@ class LLMAnalyzer:
300
  response_format=response_format,
301
  temperature=0
302
  )
303
- return response.choices[0].message.content
 
 
 
 
304
  except Exception as e:
305
  raise RuntimeError(f"Error in OpenAI analysis: {str(e)}")
306
 
307
  async def _analyze_with_deepseek(self, prompt: str) -> str:
308
- """Analyze text using OpenAI."""
309
  messages = [
310
  ChatMessage(role="system", content=SYSTEM_PROMPT),
311
  ChatMessage(role="user", content=prompt)
@@ -322,7 +327,11 @@ class LLMAnalyzer:
322
  response_format=response_format,
323
  temperature=0
324
  )
325
- return response.choices[0].message.content
 
 
 
 
326
  except Exception as e:
327
  raise RuntimeError(f"Error in DeepSeek analysis: {str(e)}")
328
 
@@ -335,7 +344,13 @@ class LLMAnalyzer:
335
  system=SYSTEM_PROMPT,
336
  messages=[{"role": "user", "content": prompt}]
337
  )
338
- return response.content[0].text
 
 
 
 
 
 
339
  except Exception as e:
340
  raise RuntimeError(f"Error in Anthropic analysis: {str(e)}")
341
 
@@ -386,48 +401,29 @@ class LLMAnalyzer:
386
  if not response_text:
387
  raise RuntimeError("Empty response from Gemini")
388
 
389
- # Витягуємо JSON з відповіді
390
- text = response_text.strip()
391
- # Знаходимо перший { і останній }
392
- start = text.find('{')
393
- end = text.rfind('}') + 1
394
-
395
- if start == -1 or end == 0:
396
- # Якщо JSON не знайдено, створюємо структурований JSON з тексту
397
- return json.dumps({
398
- "relevant_positions": [
399
- {
400
- "lp_id": "unknown",
401
- "source_index": "1",
402
- "description": text
403
- }
404
- ]
405
- }, ensure_ascii=False)
406
-
407
- json_str = text[start:end]
408
-
409
- # Перевіряємо, чи є це валідним JSON
410
- try:
411
- parsed_json = json.loads(json_str)
412
- if "relevant_positions" not in parsed_json:
413
- parsed_json = {
414
  "relevant_positions": [
415
  {
416
  "lp_id": "unknown",
417
  "source_index": "1",
418
- "description": json.dumps(parsed_json)
419
  }
420
  ]
421
  }
422
- return json.dumps(parsed_json, ensure_ascii=False)
423
- except json.JSONDecodeError:
424
- # Якщо не вдалося розпарсити JSON, повертаємо весь текст як опис
425
  return json.dumps({
426
  "relevant_positions": [
427
  {
428
  "lp_id": "unknown",
429
  "source_index": "1",
430
- "description": text
431
  }
432
  ]
433
  }, ensure_ascii=False)
@@ -614,28 +610,56 @@ def generate_legal_position(
614
  ChatMessage(role="system", content=system_prompt),
615
  ChatMessage(role="user", content=content),
616
  ]
617
- response = llm.chat(messages, response_format=LEGAL_POSITION_SCHEMA)
618
- return json.loads(response.message.content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
619
 
620
  if provider == ModelProvider.DEEPSEEK.value:
621
  client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")
622
- response = client.chat.completions.create(
623
- model=model_name,
624
- messages=[
625
- {"role": "system", "content": system_prompt},
626
- {"role": "user", "content": content},
627
- ],
628
- temperature=GENERATION_TEMPERATURE,
629
- max_tokens=MAX_TOKENS_CONFIG["deepseek"],
630
- response_format={
631
- 'type': 'json_object'
632
- },
633
- stream=False
634
- )
635
  try:
636
- return json.loads(response.choices[0].message.content)
637
- except json.JSONDecodeError:
638
- raise Exception("Помилка при парсингу відповіді від моделі Deepseek")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
639
 
640
  elif provider == ModelProvider.ANTHROPIC.value:
641
  client = Anthropic(api_key=ANTHROPIC_API_KEY)
@@ -707,62 +731,34 @@ def generate_legal_position(
707
  print(f"[DEBUG] Anthropic response text length: {len(response_text)}")
708
  print(f"[DEBUG] Response preview (first 500 chars): {response_text[:500]}")
709
 
710
- # Try to extract JSON from markdown code blocks if present
711
- text_to_parse = response_text.strip()
712
 
713
- # Remove markdown code blocks if present
714
- if text_to_parse.startswith("```json"):
715
- text_to_parse = text_to_parse[7:]
716
- elif text_to_parse.startswith("```"):
717
- text_to_parse = text_to_parse[3:]
718
-
719
- if text_to_parse.endswith("```"):
720
- text_to_parse = text_to_parse[:-3]
721
-
722
- text_to_parse = text_to_parse.strip()
723
-
724
- # Try to find JSON object in the text
725
- start_idx = text_to_parse.find('{')
726
- end_idx = text_to_parse.rfind('}')
727
-
728
- if start_idx != -1 and end_idx != -1:
729
- text_to_parse = text_to_parse[start_idx:end_idx + 1]
730
- else:
731
- print(f"[WARNING] No JSON object delimiters found in response")
732
-
733
- # Try to parse JSON
734
- try:
735
- parsed_json = json.loads(text_to_parse)
736
-
737
  # Validate required fields
738
  required = ["title", "text", "proceeding", "category"]
739
- missing = [f for f in required if f not in parsed_json]
740
  if missing:
741
- print(f"[WARNING] Missing fields in JSON: {missing}")
742
- # Try to fill missing fields
743
  for field in missing:
744
- if field not in parsed_json:
745
- parsed_json[field] = "Не вказано"
746
-
747
- return parsed_json
748
-
749
- except json.JSONDecodeError as je:
750
- print(f"[ERROR] JSON parsing failed: {je}")
751
- print(f"[ERROR] Attempted to parse: {text_to_parse[:1000]}")
752
-
753
  # Fallback: create structured response from raw text
754
- fallback = {
755
  "title": "Автоматично згенерований заголовок",
756
  "text": response_text.strip(),
757
  "proceeding": "Не визначено",
758
  "category": "Помилка парсингу JSON"
759
  }
760
- print(f"[WARNING] Using fallback response structure")
761
- return fallback
762
-
763
  except Exception as e:
764
- print(f"[ERROR] Exception during response processing: {type(e).__name__}: {e}")
765
- raise Exception(f"Помилка при обробці відповіді від моделі Anthropic: {str(e)}")
 
 
 
766
 
767
  elif provider == ModelProvider.GEMINI.value:
768
  if not os.environ.get("GEMINI_API_KEY"):
@@ -819,51 +815,30 @@ def generate_legal_position(
819
  if not response_text:
820
  raise Exception("Пуста відповідь від моделі Gemini")
821
 
822
- # Спробуємо розпарсити JSON
823
- try:
824
- # Try to extract JSON from markdown code blocks if present
825
- text_to_parse = response_text.strip()
826
-
827
- # Remove markdown code blocks if present
828
- if text_to_parse.startswith("```json"):
829
- text_to_parse = text_to_parse[7:] # Remove ```json
830
- elif text_to_parse.startswith("```"):
831
- text_to_parse = text_to_parse[3:] # Remove ```
832
-
833
- if text_to_parse.endswith("```"):
834
- text_to_parse = text_to_parse[:-3] # Remove trailing ```
835
-
836
- text_to_parse = text_to_parse.strip()
837
-
838
- # Try to find JSON object in the text
839
- start_idx = text_to_parse.find('{')
840
- end_idx = text_to_parse.rfind('}')
841
-
842
- if start_idx != -1 and end_idx != -1:
843
- text_to_parse = text_to_parse[start_idx:end_idx + 1]
844
-
845
- json_response = json.loads(text_to_parse)
846
-
847
  # Перевіряємо наявність всіх необхідних полів
848
  required_fields = ["title", "text", "proceeding", "category"]
849
  if all(field in json_response for field in required_fields):
850
  return json_response
851
  else:
852
  missing_fields = [field for field in required_fields if field not in json_response]
853
- raise Exception(f"Відсутні обов'язкові поля у відповіді: {', '.join(missing_fields)}")
854
-
855
- except json.JSONDecodeError as je:
856
- print(f"JSON parsing error: {str(je)}")
857
- print(f"Response text: {response_text[:500]}") # Log first 500 chars
858
- # Якщо відповідь не в форматі JSON, спробуємо створити структурований об'єкт
859
- # з текстової відповіді (fallback mechanism)
860
- fallback_response = {
 
861
  "title": "Автоматично сформований заголовок",
862
  "text": response_text.strip(),
863
  "proceeding": "Не визначено",
864
- "category": "Автоматично визначена категорія"
865
  }
866
- return fallback_response
867
 
868
  except Exception as e:
869
  print(f"Error in Gemini generation: {str(e)}")
 
44
  clean_text,
45
  extract_court_decision_text,
46
  get_links_html,
47
+ get_links_html_lp,
48
+ extract_json_from_text
49
  )
50
  from embeddings import GeminiEmbedding
51
 
 
301
  response_format=response_format,
302
  temperature=0
303
  )
304
+ response_text = response.choices[0].message.content
305
+
306
+ # Verify it's valid JSON
307
+ json_data = extract_json_from_text(response_text)
308
+ return json.dumps(json_data, ensure_ascii=False) if json_data else response_text
309
  except Exception as e:
310
  raise RuntimeError(f"Error in OpenAI analysis: {str(e)}")
311
 
312
  async def _analyze_with_deepseek(self, prompt: str) -> str:
313
+ """Analyze text using DeepSeek."""
314
  messages = [
315
  ChatMessage(role="system", content=SYSTEM_PROMPT),
316
  ChatMessage(role="user", content=prompt)
 
327
  response_format=response_format,
328
  temperature=0
329
  )
330
+ response_text = response.choices[0].message.content
331
+
332
+ # Verify and clean JSON
333
+ json_data = extract_json_from_text(response_text)
334
+ return json.dumps(json_data, ensure_ascii=False) if json_data else response_text
335
  except Exception as e:
336
  raise RuntimeError(f"Error in DeepSeek analysis: {str(e)}")
337
 
 
344
  system=SYSTEM_PROMPT,
345
  messages=[{"role": "user", "content": prompt}]
346
  )
347
+ response_text = response.content[0].text
348
+
349
+ # Extract JSON from potential markdown blocks
350
+ json_data = extract_json_from_text(response_text)
351
+ if json_data:
352
+ return json.dumps(json_data, ensure_ascii=False)
353
+ return response_text
354
  except Exception as e:
355
  raise RuntimeError(f"Error in Anthropic analysis: {str(e)}")
356
 
 
401
  if not response_text:
402
  raise RuntimeError("Empty response from Gemini")
403
 
404
+ # Витягуємо JSON з відповіді за допомогою універсальної функції
405
+ json_data = extract_json_from_text(response_text)
406
+
407
+ if json_data:
408
+ if "relevant_positions" not in json_data:
409
+ json_data = {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  "relevant_positions": [
411
  {
412
  "lp_id": "unknown",
413
  "source_index": "1",
414
+ "description": json.dumps(json_data, ensure_ascii=False)
415
  }
416
  ]
417
  }
418
+ return json.dumps(json_data, ensure_ascii=False)
419
+ else:
420
+ # Якщо JSON не знайдено, створюємо структурований JSON з тексту
421
  return json.dumps({
422
  "relevant_positions": [
423
  {
424
  "lp_id": "unknown",
425
  "source_index": "1",
426
+ "description": response_text
427
  }
428
  ]
429
  }, ensure_ascii=False)
 
610
  ChatMessage(role="system", content=system_prompt),
611
  ChatMessage(role="user", content=content),
612
  ]
613
+
614
+ try:
615
+ response = llm.chat(messages, response_format=LEGAL_POSITION_SCHEMA)
616
+ response_text = response.message.content
617
+
618
+ json_response = extract_json_from_text(response_text)
619
+ if json_response and all(key in json_response for key in ["title", "text", "proceeding", "category"]):
620
+ return json_response
621
+ else:
622
+ raise ValueError(f"Invalid JSON structure from OpenAI: {response_text[:200]}...")
623
+ except Exception as e:
624
+ print(f"[ERROR] OpenAI generation/parsing failed: {e}")
625
+ return {
626
+ "title": "Автоматично сформований заголовок (OpenAI)",
627
+ "text": content[:500] if not 'response_text' in locals() else response_text,
628
+ "proceeding": "Не визначено",
629
+ "category": "Помилка парсингу"
630
+ }
631
 
632
  if provider == ModelProvider.DEEPSEEK.value:
633
  client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")
 
 
 
 
 
 
 
 
 
 
 
 
 
634
  try:
635
+ response = client.chat.completions.create(
636
+ model=model_name,
637
+ messages=[
638
+ {"role": "system", "content": system_prompt},
639
+ {"role": "user", "content": content},
640
+ ],
641
+ temperature=GENERATION_TEMPERATURE,
642
+ max_tokens=MAX_TOKENS_CONFIG["deepseek"],
643
+ response_format={
644
+ 'type': 'json_object'
645
+ },
646
+ stream=False
647
+ )
648
+ response_text = response.choices[0].message.content
649
+
650
+ json_response = extract_json_from_text(response_text)
651
+ if json_response and all(key in json_response for key in ["title", "text", "proceeding", "category"]):
652
+ return json_response
653
+ else:
654
+ raise ValueError(f"Invalid JSON structure from DeepSeek: {response_text[:200]}...")
655
+ except Exception as e:
656
+ print(f"[ERROR] DeepSeek generation/parsing failed: {e}")
657
+ return {
658
+ "title": "Автоматично сформований заголовок (DeepSeek)",
659
+ "text": "Помилка при отриманні відповіді від DeepSeek",
660
+ "proceeding": "Не визначено",
661
+ "category": "Помилка API/Парсингу"
662
+ }
663
 
664
  elif provider == ModelProvider.ANTHROPIC.value:
665
  client = Anthropic(api_key=ANTHROPIC_API_KEY)
 
731
  print(f"[DEBUG] Anthropic response text length: {len(response_text)}")
732
  print(f"[DEBUG] Response preview (first 500 chars): {response_text[:500]}")
733
 
734
+ # Спробуємо розпарсити JSON за допомогою універсальної функції
735
+ json_response = extract_json_from_text(response_text)
736
 
737
+ if json_response:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
738
  # Validate required fields
739
  required = ["title", "text", "proceeding", "category"]
740
+ missing = [f for f in required if f not in json_response]
741
  if missing:
742
+ print(f"[WARNING] Missing fields in Anthropic JSON: {missing}")
 
743
  for field in missing:
744
+ if field not in json_response:
745
+ json_response[field] = "Не вказано"
746
+ return json_response
747
+ else:
748
+ print(f"[ERROR] Could not extract JSON from Anthropic response")
 
 
 
 
749
  # Fallback: create structured response from raw text
750
+ return {
751
  "title": "Автоматично згенерований заголовок",
752
  "text": response_text.strip(),
753
  "proceeding": "Не визначено",
754
  "category": "Помилка парсингу JSON"
755
  }
 
 
 
756
  except Exception as e:
757
+ # Скидання помилки для подальшого аналізу
758
+ error_details = str(e)
759
+ if hasattr(e, 'response'):
760
+ error_details += f"\nResponse: {e.response}"
761
+ raise RuntimeError(f"Error in Anthropic analysis: {error_details}")
762
 
763
  elif provider == ModelProvider.GEMINI.value:
764
  if not os.environ.get("GEMINI_API_KEY"):
 
815
  if not response_text:
816
  raise Exception("Пуста відповідь від моделі Gemini")
817
 
818
+ # Спробуємо розпарсити JSON за допомогою універсальної функції
819
+ json_response = extract_json_from_text(response_text)
820
+
821
+ if json_response:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
822
  # Перевіряємо наявність всіх необхідних полів
823
  required_fields = ["title", "text", "proceeding", "category"]
824
  if all(field in json_response for field in required_fields):
825
  return json_response
826
  else:
827
  missing_fields = [field for field in required_fields if field not in json_response]
828
+ print(f"[WARNING] Gemini response missing fields: {missing_fields}")
829
+ # Fallback for missing fields
830
+ for field in required_fields:
831
+ if field not in json_response:
832
+ json_response[field] = "Не визначено"
833
+ return json_response
834
+ else:
835
+ print(f"[ERROR] Could not extract JSON from Gemini response: {response_text[:300]}...")
836
+ return {
837
  "title": "Автоматично сформований заголовок",
838
  "text": response_text.strip(),
839
  "proceeding": "Не визначено",
840
+ "category": "Помилка парсингу"
841
  }
 
842
 
843
  except Exception as e:
844
  print(f"Error in Gemini generation: {str(e)}")
utils.py CHANGED
@@ -140,4 +140,46 @@ def get_links_html_lp(lp_ids: Union[str, int, None]) -> str:
140
  return ""
141
  links = [f"[ПП ВС: {lp_id}](https://lpd.court.gov.ua/home/search/{lp_id})"
142
  for lp_id in parsed_ids]
143
- return ", ".join(links)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  return ""
141
  links = [f"[ПП ВС: {lp_id}](https://lpd.court.gov.ua/home/search/{lp_id})"
142
  for lp_id in parsed_ids]
143
+ return ", ".join(links)
144
+
145
def extract_json_from_text(text: str) -> Optional[Dict]:
    """Extract and parse a JSON object from text, tolerating markdown noise.

    Candidates are tried in order and the first one that decodes to a JSON
    *object* wins:

    1. the whole (stripped) text;
    2. the content of the first markdown code fence (```json preferred,
       otherwise the first plain ``` fence);
    3. the span from the first ``{`` to the last ``}`` inside the fence
       content;
    4. the same brace span taken from the whole text.

    Args:
        text: Raw model output; may be empty or ``None``.

    Returns:
        The parsed ``dict``, or ``None`` when no candidate decodes to a JSON
        object. Valid JSON that is not an object (list, number, string, ...)
        is never returned, so callers can safely index the result by key.
    """
    if not text:
        return None

    stripped = text.strip()

    # Narrow to the first fenced block when one is present.
    fenced = stripped
    if "```json" in stripped:
        fenced = stripped.split("```json", 1)[1].split("```", 1)[0].strip()
    elif "```" in stripped:
        fenced = stripped.split("```", 2)[1].strip()

    candidates = [stripped, fenced]

    # Last resort: outermost brace span, first within the fence and then in
    # the full text (the object may live outside a non-JSON fence).
    for source in (fenced, stripped):
        start_idx = source.find("{")
        end_idx = source.rfind("}")
        if start_idx != -1 and end_idx > start_idx:
            candidates.append(source[start_idx:end_idx + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Only JSON objects satisfy the declared Optional[Dict] contract.
        if isinstance(parsed, dict):
            return parsed

    return None