Spaces:
Sleeping
Sleeping
Fix Greek OCR and update Latin OCR model
Browse files
- models/groq_client.py +11 -7
- processors/greek_processor.py +70 -26
models/groq_client.py
CHANGED
|
@@ -23,7 +23,7 @@ class GroqClient:
|
|
| 23 |
"""Check if Groq API client is available and configured"""
|
| 24 |
return self.client is not None
|
| 25 |
|
| 26 |
-
def generate_response(self, system_prompt: str, user_prompt: str, max_tokens: int = 1024) -> str:
|
| 27 |
"""Generate response from Groq LLM"""
|
| 28 |
if not self.is_available():
|
| 29 |
print("[WARN] GroqClient not available for generating response.")
|
|
@@ -36,15 +36,19 @@ class GroqClient:
|
|
| 36 |
if model == "openai/gpt-oss-120b":
|
| 37 |
model = "llama-3.1-8b-instant" # standard Groq model
|
| 38 |
|
| 39 |
-
|
| 40 |
-
model
|
| 41 |
-
messages
|
| 42 |
{"role": "system", "content": system_prompt},
|
| 43 |
{"role": "user", "content": user_prompt}
|
| 44 |
],
|
| 45 |
-
temperature
|
| 46 |
-
max_completion_tokens
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
return completion.choices[0].message.content
|
| 49 |
except Exception as e:
|
| 50 |
print(f"[ERROR] Groq API call failed: {e}")
|
|
|
|
| 23 |
"""Check if Groq API client is available and configured"""
|
| 24 |
return self.client is not None
|
| 25 |
|
| 26 |
+
def generate_response(self, system_prompt: str, user_prompt: str, max_tokens: int = 1024, response_format = None) -> str:
|
| 27 |
"""Generate response from Groq LLM"""
|
| 28 |
if not self.is_available():
|
| 29 |
print("[WARN] GroqClient not available for generating response.")
|
|
|
|
| 36 |
if model == "openai/gpt-oss-120b":
|
| 37 |
model = "llama-3.1-8b-instant" # standard Groq model
|
| 38 |
|
| 39 |
+
params = {
|
| 40 |
+
"model": model,
|
| 41 |
+
"messages": [
|
| 42 |
{"role": "system", "content": system_prompt},
|
| 43 |
{"role": "user", "content": user_prompt}
|
| 44 |
],
|
| 45 |
+
"temperature": getattr(self.config, 'GROQ_TEMPERATURE', 0.7),
|
| 46 |
+
"max_completion_tokens": max_tokens,
|
| 47 |
+
}
|
| 48 |
+
if response_format is not None:
|
| 49 |
+
params["response_format"] = response_format
|
| 50 |
+
|
| 51 |
+
completion = self.client.chat.completions.create(**params)
|
| 52 |
return completion.choices[0].message.content
|
| 53 |
except Exception as e:
|
| 54 |
print(f"[ERROR] Groq API call failed: {e}")
|
processors/greek_processor.py
CHANGED
|
@@ -25,6 +25,11 @@ class GreekProcessor(BaseScriptProcessor):
|
|
| 25 |
# Register for dynamic VRAM management
|
| 26 |
from utils.gpu_diagnostics import register_processor
|
| 27 |
register_processor("greek", self)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
def setup_greek_trocr(self):
|
| 30 |
"""Setup TrOCR model — BEST for ancient Greek manuscripts"""
|
|
@@ -554,50 +559,77 @@ class GreekProcessor(BaseScriptProcessor):
|
|
| 554 |
|
| 555 |
system_prompt = (
|
| 556 |
"You are an expert classicist and lexicographer of Ancient Greek. "
|
| 557 |
-
"
|
| 558 |
-
"
|
| 559 |
)
|
| 560 |
user_prompt = (
|
| 561 |
-
f"For each of the following Ancient Greek words, provide a
|
| 562 |
-
f"etymological note,
|
| 563 |
f"Words: {terms_list}\n\n"
|
| 564 |
-
f"
|
| 565 |
-
f"
|
| 566 |
-
f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 567 |
)
|
| 568 |
|
| 569 |
try:
|
| 570 |
raw_response = self.groq_client.generate_response(
|
| 571 |
system_prompt=system_prompt,
|
| 572 |
user_prompt=user_prompt,
|
| 573 |
-
max_tokens=2048
|
|
|
|
| 574 |
)
|
| 575 |
# Safe print to avoid UnicodeEncodeError in Windows command prompt
|
| 576 |
print(f"[INFO] Groq glossary raw response: {raw_response.encode('ascii', 'backslashreplace').decode()}")
|
| 577 |
|
| 578 |
# Find JSON block in response
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 596 |
except Exception as e:
|
| 597 |
print(f"[WARN] Failed to generate batch Greek explanations: {e}")
|
| 598 |
|
| 599 |
return {}
|
| 600 |
-
|
| 601 |
def _build_uses_list(self, terms, greek_text):
|
| 602 |
"""Build list of symbol/word uses using RAG and batch Groq explanations"""
|
| 603 |
import unicodedata
|
|
@@ -652,6 +684,18 @@ class GreekProcessor(BaseScriptProcessor):
|
|
| 652 |
definition = definitions.get(term)
|
| 653 |
if not definition:
|
| 654 |
definition = f"Ancient Greek lexical term. Characterized by specific diacritics and phonological values."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 655 |
items.append(f"{term}: {definition}")
|
| 656 |
|
| 657 |
# 2. Add significant paleographical/character markers found in the text if they are in the references
|
|
|
|
| 25 |
# Register for dynamic VRAM management
|
| 26 |
from utils.gpu_diagnostics import register_processor
|
| 27 |
register_processor("greek", self)
|
| 28 |
+
|
| 29 |
+
# Metrics for Greek Glossary
|
| 30 |
+
self.glossary_success_count = 0
|
| 31 |
+
self.glossary_json_failure_count = 0
|
| 32 |
+
self.regex_recovery_count = 0
|
| 33 |
|
| 34 |
def setup_greek_trocr(self):
|
| 35 |
"""Setup TrOCR model — BEST for ancient Greek manuscripts"""
|
|
|
|
| 559 |
|
| 560 |
system_prompt = (
|
| 561 |
"You are an expert classicist and lexicographer of Ancient Greek. "
|
| 562 |
+
"Return ONLY valid JSON matching the requested schema. "
|
| 563 |
+
"No markdown, no code fences (like ```json), no explanations, no prose."
|
| 564 |
)
|
| 565 |
user_prompt = (
|
| 566 |
+
f"For each of the following Ancient Greek words, provide a scholarly definition, "
|
| 567 |
+
f"etymological note, and grammatical gloss:\n\n"
|
| 568 |
f"Words: {terms_list}\n\n"
|
| 569 |
+
f"You MUST format the output as a single JSON object where the keys are the exact words "
|
| 570 |
+
f"and the values are objects containing 'definition', 'gloss', and 'etymology' keys.\n\n"
|
| 571 |
+
f"Output schema:\n"
|
| 572 |
+
f"{{\n"
|
| 573 |
+
f" \"TERM\": {{\n"
|
| 574 |
+
f" \"definition\": \"...\",\n"
|
| 575 |
+
f" \"gloss\": \"...\",\n"
|
| 576 |
+
f" \"etymology\": \"...\"\n"
|
| 577 |
+
f" }}\n"
|
| 578 |
+
f"}}\n"
|
| 579 |
)
|
| 580 |
|
| 581 |
try:
|
| 582 |
raw_response = self.groq_client.generate_response(
|
| 583 |
system_prompt=system_prompt,
|
| 584 |
user_prompt=user_prompt,
|
| 585 |
+
max_tokens=2048,
|
| 586 |
+
response_format={"type": "json_object"}
|
| 587 |
)
|
| 588 |
# Safe print to avoid UnicodeEncodeError in Windows command prompt
|
| 589 |
print(f"[INFO] Groq glossary raw response: {raw_response.encode('ascii', 'backslashreplace').decode()}")
|
| 590 |
|
| 591 |
# Find JSON block in response
|
| 592 |
+
json_str = raw_response.strip()
|
| 593 |
+
if "{" in json_str and "}" in json_str:
|
| 594 |
+
start = json_str.find("{")
|
| 595 |
+
end = json_str.rfind("}") + 1
|
| 596 |
+
json_str = json_str[start:end]
|
| 597 |
+
|
| 598 |
+
import json
|
| 599 |
+
definitions = {}
|
| 600 |
+
try:
|
| 601 |
+
definitions = json.loads(json_str)
|
| 602 |
+
self.glossary_success_count += 1
|
| 603 |
+
except Exception as je:
|
| 604 |
+
self.glossary_json_failure_count += 1
|
| 605 |
+
import logging
|
| 606 |
+
logger = logging.getLogger(__name__)
|
| 607 |
+
logger.warning(
|
| 608 |
+
"Malformed Greek glossary JSON",
|
| 609 |
+
extra={"response": raw_response[:2000]}
|
| 610 |
+
)
|
| 611 |
+
print(f"[WARN] Standard JSON load failed: {je}. Attempting regex recovery...")
|
| 612 |
+
|
| 613 |
+
# Regex recovery fallback
|
| 614 |
+
import re
|
| 615 |
+
self.regex_recovery_count += 1
|
| 616 |
+
term_blocks = re.findall(r'"([^"]+)"\s*:\s*\{([^}]+)\}', json_str)
|
| 617 |
+
for term, block in term_blocks:
|
| 618 |
+
def_match = re.search(r'"definition"\s*:\s*["\']([^"\']+)["\']', block)
|
| 619 |
+
gloss_match = re.search(r'"gloss"\s*:\s*["\']([^"\']+)["\']', block)
|
| 620 |
+
ety_match = re.search(r'"etymology"\s*:\s*["\']([^"\']+)["\']', block)
|
| 621 |
+
definitions[term] = {
|
| 622 |
+
"definition": def_match.group(1) if def_match else "",
|
| 623 |
+
"gloss": gloss_match.group(1) if gloss_match else "",
|
| 624 |
+
"etymology": ety_match.group(1) if ety_match else ""
|
| 625 |
+
}
|
| 626 |
+
|
| 627 |
+
return definitions
|
| 628 |
except Exception as e:
|
| 629 |
print(f"[WARN] Failed to generate batch Greek explanations: {e}")
|
| 630 |
|
| 631 |
return {}
|
| 632 |
+
|
| 633 |
def _build_uses_list(self, terms, greek_text):
|
| 634 |
"""Build list of symbol/word uses using RAG and batch Groq explanations"""
|
| 635 |
import unicodedata
|
|
|
|
| 684 |
definition = definitions.get(term)
|
| 685 |
if not definition:
|
| 686 |
definition = f"Ancient Greek lexical term. Characterized by specific diacritics and phonological values."
|
| 687 |
+
elif isinstance(definition, dict):
|
| 688 |
+
parts = []
|
| 689 |
+
d_val = definition.get("definition", "").strip()
|
| 690 |
+
g_val = definition.get("gloss", "").strip()
|
| 691 |
+
e_val = definition.get("etymology", "").strip()
|
| 692 |
+
if d_val:
|
| 693 |
+
parts.append(d_val)
|
| 694 |
+
if g_val:
|
| 695 |
+
parts.append(f"Gloss: {g_val}")
|
| 696 |
+
if e_val:
|
| 697 |
+
parts.append(f"Etymology: {e_val}")
|
| 698 |
+
definition = " | ".join(parts) if parts else "Ancient Greek lexical term."
|
| 699 |
items.append(f"{term}: {definition}")
|
| 700 |
|
| 701 |
# 2. Add significant paleographical/character markers found in the text if they are in the references
|