Akshay30 committed on
Commit
62db04d
·
1 Parent(s): 36331c6

Fix Greek OCR and update Latin OCR model

Browse files
models/groq_client.py CHANGED
@@ -23,7 +23,7 @@ class GroqClient:
23
  """Check if Groq API client is available and configured"""
24
  return self.client is not None
25
 
26
- def generate_response(self, system_prompt: str, user_prompt: str, max_tokens: int = 1024) -> str:
27
  """Generate response from Groq LLM"""
28
  if not self.is_available():
29
  print("[WARN] GroqClient not available for generating response.")
@@ -36,15 +36,19 @@ class GroqClient:
36
  if model == "openai/gpt-oss-120b":
37
  model = "llama-3.1-8b-instant" # standard Groq model
38
 
39
- completion = self.client.chat.completions.create(
40
- model=model,
41
- messages=[
42
  {"role": "system", "content": system_prompt},
43
  {"role": "user", "content": user_prompt}
44
  ],
45
- temperature=getattr(self.config, 'GROQ_TEMPERATURE', 0.7),
46
- max_completion_tokens=max_tokens,
47
- )
 
 
 
 
48
  return completion.choices[0].message.content
49
  except Exception as e:
50
  print(f"[ERROR] Groq API call failed: {e}")
 
23
  """Check if Groq API client is available and configured"""
24
  return self.client is not None
25
 
26
+ def generate_response(self, system_prompt: str, user_prompt: str, max_tokens: int = 1024, response_format = None) -> str:
27
  """Generate response from Groq LLM"""
28
  if not self.is_available():
29
  print("[WARN] GroqClient not available for generating response.")
 
36
  if model == "openai/gpt-oss-120b":
37
  model = "llama-3.1-8b-instant" # standard Groq model
38
 
39
+ params = {
40
+ "model": model,
41
+ "messages": [
42
  {"role": "system", "content": system_prompt},
43
  {"role": "user", "content": user_prompt}
44
  ],
45
+ "temperature": getattr(self.config, 'GROQ_TEMPERATURE', 0.7),
46
+ "max_completion_tokens": max_tokens,
47
+ }
48
+ if response_format is not None:
49
+ params["response_format"] = response_format
50
+
51
+ completion = self.client.chat.completions.create(**params)
52
  return completion.choices[0].message.content
53
  except Exception as e:
54
  print(f"[ERROR] Groq API call failed: {e}")
processors/greek_processor.py CHANGED
@@ -25,6 +25,11 @@ class GreekProcessor(BaseScriptProcessor):
25
  # Register for dynamic VRAM management
26
  from utils.gpu_diagnostics import register_processor
27
  register_processor("greek", self)
 
 
 
 
 
28
 
29
  def setup_greek_trocr(self):
30
  """Setup TrOCR model — BEST for ancient Greek manuscripts"""
@@ -554,50 +559,77 @@ class GreekProcessor(BaseScriptProcessor):
554
 
555
  system_prompt = (
556
  "You are an expert classicist and lexicographer of Ancient Greek. "
557
- "Respond ONLY with a JSON object. Do NOT wrap values in double quotes inside the strings. "
558
- "Use single quotes '...' for any internal quotes, definitions, or translations."
559
  )
560
  user_prompt = (
561
- f"For each of the following Ancient Greek words, provide a brief, scholarly one-sentence definition, "
562
- f"etymological note, or grammatical gloss:\n\n"
563
  f"Words: {terms_list}\n\n"
564
- f"Respond ONLY with a JSON object where the keys are the exact words and the values are the definitions.\n"
565
- f"Do NOT use double quotes inside the definitions/values; use single quotes instead.\n"
566
- f"Example: {{\"word1\": \"definition1\", \"word2\": \"definition2\"}}"
 
 
 
 
 
 
 
567
  )
568
 
569
  try:
570
  raw_response = self.groq_client.generate_response(
571
  system_prompt=system_prompt,
572
  user_prompt=user_prompt,
573
- max_tokens=2048
 
574
  )
575
  # Safe print to avoid UnicodeEncodeError in Windows command prompt
576
  print(f"[INFO] Groq glossary raw response: {raw_response.encode('ascii', 'backslashreplace').decode()}")
577
 
578
  # Find JSON block in response
579
- if "{" in raw_response and "}" in raw_response:
580
- start = raw_response.find("{")
581
- end = raw_response.rfind("}") + 1
582
- json_str = raw_response[start:end]
583
- import json
584
- try:
585
- definitions = json.loads(json_str)
586
- except Exception as je:
587
- print(f"[WARN] Standard JSON load failed: {je}. Attempting regex recovery...")
588
- import re
589
- definitions = {}
590
- # Matches "key": "value"
591
- pattern = re.compile(r'"([^"]+)":\s*"((?:[^"\\]|\\.)*)"')
592
- matches = pattern.findall(json_str)
593
- for k, v in matches:
594
- definitions[k] = v
595
- return {k: str(v) for k, v in definitions.items()}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
  except Exception as e:
597
  print(f"[WARN] Failed to generate batch Greek explanations: {e}")
598
 
599
  return {}
600
-
601
  def _build_uses_list(self, terms, greek_text):
602
  """Build list of symbol/word uses using RAG and batch Groq explanations"""
603
  import unicodedata
@@ -652,6 +684,18 @@ class GreekProcessor(BaseScriptProcessor):
652
  definition = definitions.get(term)
653
  if not definition:
654
  definition = f"Ancient Greek lexical term. Characterized by specific diacritics and phonological values."
 
 
 
 
 
 
 
 
 
 
 
 
655
  items.append(f"{term}: {definition}")
656
 
657
  # 2. Add significant paleographical/character markers found in the text if they are in the references
 
25
  # Register for dynamic VRAM management
26
  from utils.gpu_diagnostics import register_processor
27
  register_processor("greek", self)
28
+
29
+ # Metrics for Greek Glossary
30
+ self.glossary_success_count = 0
31
+ self.glossary_json_failure_count = 0
32
+ self.regex_recovery_count = 0
33
 
34
  def setup_greek_trocr(self):
35
  """Setup TrOCR model — BEST for ancient Greek manuscripts"""
 
559
 
560
  system_prompt = (
561
  "You are an expert classicist and lexicographer of Ancient Greek. "
562
+ "Return ONLY valid JSON matching the requested schema. "
563
+ "No markdown, no code fences (like ```json), no explanations, no prose."
564
  )
565
  user_prompt = (
566
+ f"For each of the following Ancient Greek words, provide a scholarly definition, "
567
+ f"etymological note, and grammatical gloss:\n\n"
568
  f"Words: {terms_list}\n\n"
569
+ f"You MUST format the output as a single JSON object where the keys are the exact words "
570
+ f"and the values are objects containing 'definition', 'gloss', and 'etymology' keys.\n\n"
571
+ f"Output schema:\n"
572
+ f"{{\n"
573
+ f" \"TERM\": {{\n"
574
+ f" \"definition\": \"...\",\n"
575
+ f" \"gloss\": \"...\",\n"
576
+ f" \"etymology\": \"...\"\n"
577
+ f" }}\n"
578
+ f"}}\n"
579
  )
580
 
581
  try:
582
  raw_response = self.groq_client.generate_response(
583
  system_prompt=system_prompt,
584
  user_prompt=user_prompt,
585
+ max_tokens=2048,
586
+ response_format={"type": "json_object"}
587
  )
588
  # Safe print to avoid UnicodeEncodeError in Windows command prompt
589
  print(f"[INFO] Groq glossary raw response: {raw_response.encode('ascii', 'backslashreplace').decode()}")
590
 
591
  # Find JSON block in response
592
+ json_str = raw_response.strip()
593
+ if "{" in json_str and "}" in json_str:
594
+ start = json_str.find("{")
595
+ end = json_str.rfind("}") + 1
596
+ json_str = json_str[start:end]
597
+
598
+ import json
599
+ definitions = {}
600
+ try:
601
+ definitions = json.loads(json_str)
602
+ self.glossary_success_count += 1
603
+ except Exception as je:
604
+ self.glossary_json_failure_count += 1
605
+ import logging
606
+ logger = logging.getLogger(__name__)
607
+ logger.warning(
608
+ "Malformed Greek glossary JSON",
609
+ extra={"response": raw_response[:2000]}
610
+ )
611
+ print(f"[WARN] Standard JSON load failed: {je}. Attempting regex recovery...")
612
+
613
+ # Regex recovery fallback
614
+ import re
615
+ self.regex_recovery_count += 1
616
+ term_blocks = re.findall(r'"([^"]+)"\s*:\s*\{([^}]+)\}', json_str)
617
+ for term, block in term_blocks:
618
+ def_match = re.search(r'"definition"\s*:\s*["\']([^"\']+)["\']', block)
619
+ gloss_match = re.search(r'"gloss"\s*:\s*["\']([^"\']+)["\']', block)
620
+ ety_match = re.search(r'"etymology"\s*:\s*["\']([^"\']+)["\']', block)
621
+ definitions[term] = {
622
+ "definition": def_match.group(1) if def_match else "",
623
+ "gloss": gloss_match.group(1) if gloss_match else "",
624
+ "etymology": ety_match.group(1) if ety_match else ""
625
+ }
626
+
627
+ return definitions
628
  except Exception as e:
629
  print(f"[WARN] Failed to generate batch Greek explanations: {e}")
630
 
631
  return {}
632
+
633
  def _build_uses_list(self, terms, greek_text):
634
  """Build list of symbol/word uses using RAG and batch Groq explanations"""
635
  import unicodedata
 
684
  definition = definitions.get(term)
685
  if not definition:
686
  definition = f"Ancient Greek lexical term. Characterized by specific diacritics and phonological values."
687
+ elif isinstance(definition, dict):
688
+ parts = []
689
+ d_val = definition.get("definition", "").strip()
690
+ g_val = definition.get("gloss", "").strip()
691
+ e_val = definition.get("etymology", "").strip()
692
+ if d_val:
693
+ parts.append(d_val)
694
+ if g_val:
695
+ parts.append(f"Gloss: {g_val}")
696
+ if e_val:
697
+ parts.append(f"Etymology: {e_val}")
698
+ definition = " | ".join(parts) if parts else "Ancient Greek lexical term."
699
  items.append(f"{term}: {definition}")
700
 
701
  # 2. Add significant paleographical/character markers found in the text if they are in the references