Spaces:
Running
Running
Upload 4 files
Browse files- app.py +45 -13
- authgpt_auth.py +44 -0
- extract_glossary_from_epub.py +14 -2
app.py
CHANGED
|
@@ -623,7 +623,7 @@ class GlossarionWeb:
|
|
| 623 |
os.environ['MANUAL_GLOSSARY_MAX_TITLES'] = str(config('manual_glossary_max_titles', 30))
|
| 624 |
os.environ['GLOSSARY_MAX_TEXT_SIZE'] = str(config('glossary_max_text_size', 0))
|
| 625 |
os.environ['GLOSSARY_MAX_SENTENCES'] = str(config('glossary_max_sentences', 200))
|
| 626 |
-
os.environ['GLOSSARY_CHAPTER_SPLIT_THRESHOLD'] = str(config('glossary_chapter_split_threshold',
|
| 627 |
os.environ['MANUAL_GLOSSARY_FILTER_MODE'] = config('manual_glossary_filter_mode', 'all')
|
| 628 |
os.environ['STRIP_HONORIFICS'] = '1' if config('strip_honorifics', True) else '0'
|
| 629 |
os.environ['MANUAL_GLOSSARY_FUZZY_THRESHOLD'] = str(config('manual_glossary_fuzzy_threshold', 0.90))
|
|
@@ -711,8 +711,19 @@ class GlossarionWeb:
|
|
| 711 |
# Output language
|
| 712 |
os.environ['OUTPUT_LANGUAGE'] = config('output_language', 'English')
|
| 713 |
|
| 714 |
-
# Glossary compression
|
| 715 |
-
os.environ['COMPRESS_GLOSSARY_PROMPT'] = '1' if config('compress_glossary_prompt',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 716 |
|
| 717 |
# Additional glossary
|
| 718 |
os.environ['ADD_ADDITIONAL_GLOSSARY'] = '1' if config('add_additional_glossary', False) else '0'
|
|
@@ -1416,6 +1427,7 @@ class GlossarionWeb:
|
|
| 1416 |
yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Starting...", 0
|
| 1417 |
|
| 1418 |
input_path = epub_file.name if hasattr(epub_file, 'name') else epub_file
|
|
|
|
| 1419 |
output_path = input_path.replace('.epub', '_glossary.csv')
|
| 1420 |
|
| 1421 |
extraction_logs.append(f"📖 Input: {os.path.basename(input_path)}")
|
|
@@ -1425,19 +1437,28 @@ class GlossarionWeb:
|
|
| 1425 |
# Set all environment variables from config
|
| 1426 |
self.set_all_environment_variables()
|
| 1427 |
|
| 1428 |
-
# Set API key
|
| 1429 |
-
|
| 1430 |
-
|
| 1431 |
-
|
| 1432 |
-
|
| 1433 |
-
|
| 1434 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1435 |
|
| 1436 |
extraction_logs.append("📋 Extracting text from EPUB...")
|
| 1437 |
yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Extracting text...", 20
|
| 1438 |
|
| 1439 |
# Set environment variables for glossary extraction
|
| 1440 |
os.environ['MODEL'] = model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1441 |
os.environ['GLOSSARY_MIN_FREQUENCY'] = str(min_frequency)
|
| 1442 |
os.environ['GLOSSARY_MAX_NAMES'] = str(max_names)
|
| 1443 |
os.environ['GLOSSARY_MAX_TITLES'] = str(max_titles)
|
|
@@ -1550,10 +1571,21 @@ class GlossarionWeb:
|
|
| 1550 |
extraction_logs.append("🖍️ Writing glossary to CSV...")
|
| 1551 |
yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Writing CSV...", 95
|
| 1552 |
|
| 1553 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1554 |
extraction_logs.append(f"✅ Glossary extracted successfully!")
|
| 1555 |
-
extraction_logs.append(f"💾 Saved to: {os.path.basename(
|
| 1556 |
-
yield
|
| 1557 |
else:
|
| 1558 |
extraction_logs.append("❌ Glossary extraction failed - output file not created")
|
| 1559 |
yield None, None, gr.update(visible=False), "\n".join(extraction_logs), gr.update(visible=True), "Extraction failed", 0
|
|
|
|
| 623 |
os.environ['MANUAL_GLOSSARY_MAX_TITLES'] = str(config('manual_glossary_max_titles', 30))
|
| 624 |
os.environ['GLOSSARY_MAX_TEXT_SIZE'] = str(config('glossary_max_text_size', 0))
|
| 625 |
os.environ['GLOSSARY_MAX_SENTENCES'] = str(config('glossary_max_sentences', 200))
|
| 626 |
+
os.environ['GLOSSARY_CHAPTER_SPLIT_THRESHOLD'] = str(config('glossary_chapter_split_threshold', 0))
|
| 627 |
os.environ['MANUAL_GLOSSARY_FILTER_MODE'] = config('manual_glossary_filter_mode', 'all')
|
| 628 |
os.environ['STRIP_HONORIFICS'] = '1' if config('strip_honorifics', True) else '0'
|
| 629 |
os.environ['MANUAL_GLOSSARY_FUZZY_THRESHOLD'] = str(config('manual_glossary_fuzzy_threshold', 0.90))
|
|
|
|
| 711 |
# Output language
|
| 712 |
os.environ['OUTPUT_LANGUAGE'] = config('output_language', 'English')
|
| 713 |
|
| 714 |
+
# Glossary compression (enabled by default)
|
| 715 |
+
os.environ['COMPRESS_GLOSSARY_PROMPT'] = '1' if config('compress_glossary_prompt', True) else '0'
|
| 716 |
+
|
| 717 |
+
# Dynamic limit expansion (enabled by default)
|
| 718 |
+
os.environ['GLOSSARY_INCLUDE_ALL_CHARACTERS'] = '1' if config('glossary_include_all_characters', True) else '0'
|
| 719 |
+
|
| 720 |
+
# Auto glossary prompt from config (prevents fallback to hard-coded default)
|
| 721 |
+
auto_gloss_prompt = config('unified_auto_glosary_prompt3', '')
|
| 722 |
+
if auto_gloss_prompt:
|
| 723 |
+
os.environ['AUTO_GLOSSARY_PROMPT'] = auto_gloss_prompt
|
| 724 |
+
|
| 725 |
+
# Output token limit
|
| 726 |
+
os.environ['MAX_OUTPUT_TOKENS'] = str(config('max_output_tokens', 128000))
|
| 727 |
|
| 728 |
# Additional glossary
|
| 729 |
os.environ['ADD_ADDITIONAL_GLOSSARY'] = '1' if config('add_additional_glossary', False) else '0'
|
|
|
|
| 1427 |
yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Starting...", 0
|
| 1428 |
|
| 1429 |
input_path = epub_file.name if hasattr(epub_file, 'name') else epub_file
|
| 1430 |
+
output_json_path = input_path.replace('.epub', '_glossary.json')
|
| 1431 |
output_path = input_path.replace('.epub', '_glossary.csv')
|
| 1432 |
|
| 1433 |
extraction_logs.append(f"📖 Input: {os.path.basename(input_path)}")
|
|
|
|
| 1437 |
# Set all environment variables from config
|
| 1438 |
self.set_all_environment_variables()
|
| 1439 |
|
| 1440 |
+
# Set API key (set all common env vars for unified_api_client compatibility)
|
| 1441 |
+
os.environ['API_KEY'] = api_key
|
| 1442 |
+
os.environ['OPENAI_API_KEY'] = api_key
|
| 1443 |
+
os.environ['OPENAI_OR_Gemini_API_KEY'] = api_key
|
| 1444 |
+
os.environ['GEMINI_API_KEY'] = api_key
|
| 1445 |
+
|
| 1446 |
+
# Enable streaming logs so extraction shows real-time API output
|
| 1447 |
+
os.environ['ENABLE_STREAMING'] = '1'
|
| 1448 |
+
os.environ['LOG_STREAM_CHUNKS'] = '1'
|
| 1449 |
+
os.environ['ALLOW_BATCH_STREAM_LOGS'] = '1'
|
| 1450 |
+
os.environ['ALLOW_AUTHGPT_BATCH_STREAM_LOGS'] = '1'
|
| 1451 |
|
| 1452 |
extraction_logs.append("📋 Extracting text from EPUB...")
|
| 1453 |
yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Extracting text...", 20
|
| 1454 |
|
| 1455 |
# Set environment variables for glossary extraction
|
| 1456 |
os.environ['MODEL'] = model
|
| 1457 |
+
|
| 1458 |
+
# CRITICAL: Set EPUB_PATH and OUTPUT_PATH for extract_glossary_from_epub.main() GUI mode
|
| 1459 |
+
os.environ['EPUB_PATH'] = input_path
|
| 1460 |
+
os.environ['OUTPUT_PATH'] = output_json_path
|
| 1461 |
+
|
| 1462 |
os.environ['GLOSSARY_MIN_FREQUENCY'] = str(min_frequency)
|
| 1463 |
os.environ['GLOSSARY_MAX_NAMES'] = str(max_names)
|
| 1464 |
os.environ['GLOSSARY_MAX_TITLES'] = str(max_titles)
|
|
|
|
| 1571 |
extraction_logs.append("🖍️ Writing glossary to CSV...")
|
| 1572 |
yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Writing CSV...", 95
|
| 1573 |
|
| 1574 |
+
# The extract_glossary_from_epub module saves CSV inside a Glossary/ subfolder
|
| 1575 |
+
glossary_dir = os.path.join(os.path.dirname(output_json_path), "Glossary")
|
| 1576 |
+
glossary_csv_in_subdir = os.path.join(glossary_dir, os.path.basename(output_path))
|
| 1577 |
+
|
| 1578 |
+
# Check multiple possible output locations
|
| 1579 |
+
found_output = None
|
| 1580 |
+
for candidate in [glossary_csv_in_subdir, output_path, output_json_path]:
|
| 1581 |
+
if os.path.exists(candidate):
|
| 1582 |
+
found_output = candidate
|
| 1583 |
+
break
|
| 1584 |
+
|
| 1585 |
+
if found_output:
|
| 1586 |
extraction_logs.append(f"✅ Glossary extracted successfully!")
|
| 1587 |
+
extraction_logs.append(f"💾 Saved to: {os.path.basename(found_output)}")
|
| 1588 |
+
yield found_output, gr.update(visible=True), gr.update(visible=False), "\n".join(extraction_logs), gr.update(visible=True), "Extraction complete!", 100
|
| 1589 |
else:
|
| 1590 |
extraction_logs.append("❌ Glossary extraction failed - output file not created")
|
| 1591 |
yield None, None, gr.update(visible=False), "\n".join(extraction_logs), gr.update(visible=True), "Extraction failed", 0
|
authgpt_auth.py
CHANGED
|
@@ -459,6 +459,50 @@ class AuthGPTTokenStore:
|
|
| 459 |
"Run the OAuth login flow first."
|
| 460 |
)
|
| 461 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 462 |
print("🔄 AuthGPT: No valid token found – starting browser login…")
|
| 463 |
new_tokens = run_oauth_flow()
|
| 464 |
self.save_tokens(new_tokens)
|
|
|
|
| 459 |
"Run the OAuth login flow first."
|
| 460 |
)
|
| 461 |
|
| 462 |
+
# Detect headless environments (HF Spaces, Docker, etc.) where browser login is impossible
|
| 463 |
+
is_headless = (
|
| 464 |
+
os.environ.get("SPACE_ID") is not None
|
| 465 |
+
or os.environ.get("HF_SPACES") == "true"
|
| 466 |
+
or os.environ.get("DOCKER_CONTAINER") == "true"
|
| 467 |
+
or os.environ.get("KUBERNETES_SERVICE_HOST") is not None
|
| 468 |
+
)
|
| 469 |
+
if is_headless:
|
| 470 |
+
# Check for manually-provided tokens via environment variables
|
| 471 |
+
env_access = os.environ.get("AUTHGPT_ACCESS_TOKEN", "").strip()
|
| 472 |
+
env_refresh = os.environ.get("AUTHGPT_REFRESH_TOKEN", "").strip()
|
| 473 |
+
if env_access:
|
| 474 |
+
# User provided an access token directly — save and use it
|
| 475 |
+
manual_tokens = {
|
| 476 |
+
"access_token": env_access,
|
| 477 |
+
"expires_at": time.time() + 3600, # assume 1h validity
|
| 478 |
+
}
|
| 479 |
+
if env_refresh:
|
| 480 |
+
manual_tokens["refresh_token"] = env_refresh
|
| 481 |
+
self.save_tokens(manual_tokens)
|
| 482 |
+
logger.info("AuthGPT: Using access token from AUTHGPT_ACCESS_TOKEN env var")
|
| 483 |
+
return env_access
|
| 484 |
+
if env_refresh:
|
| 485 |
+
# Try refreshing with the provided refresh token
|
| 486 |
+
try:
|
| 487 |
+
refreshed = refresh_access_token(env_refresh)
|
| 488 |
+
self.save_tokens(refreshed)
|
| 489 |
+
logger.info("AuthGPT: Obtained access token via AUTHGPT_REFRESH_TOKEN env var")
|
| 490 |
+
return refreshed["access_token"]
|
| 491 |
+
except Exception as ref_exc:
|
| 492 |
+
raise RuntimeError(
|
| 493 |
+
f"AuthGPT: AUTHGPT_REFRESH_TOKEN was set but refresh failed: {ref_exc}\n"
|
| 494 |
+
"The refresh token may be expired. Please obtain a new one."
|
| 495 |
+
)
|
| 496 |
+
raise RuntimeError(
|
| 497 |
+
"AuthGPT: Browser-based OAuth login is not available in headless environments "
|
| 498 |
+
"(e.g. Hugging Face Spaces, Docker containers).\n"
|
| 499 |
+
"To use AuthGPT models, set one of these as environment secrets:\n"
|
| 500 |
+
" • AUTHGPT_ACCESS_TOKEN — a valid ChatGPT OAuth access token\n"
|
| 501 |
+
" • AUTHGPT_REFRESH_TOKEN — a ChatGPT OAuth refresh token (will auto-refresh)\n"
|
| 502 |
+
"You can obtain these by running the OAuth flow locally first, then copying\n"
|
| 503 |
+
"the tokens from ~/.glossarion/authgpt_tokens.json"
|
| 504 |
+
)
|
| 505 |
+
|
| 506 |
print("🔄 AuthGPT: No valid token found – starting browser login…")
|
| 507 |
new_tokens = run_oauth_flow()
|
| 508 |
self.save_tokens(new_tokens)
|
extract_glossary_from_epub.py
CHANGED
|
@@ -841,8 +841,20 @@ def set_output_redirect(log_callback=None):
|
|
| 841 |
sys.stdout = CallbackWriter(log_callback)
|
| 842 |
|
| 843 |
def load_config(path: str) -> Dict:
|
| 844 |
-
|
| 845 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 846 |
|
| 847 |
# override context_limit_chapters if GUI passed GLOSSARY_CONTEXT_LIMIT
|
| 848 |
env_limit = os.getenv("GLOSSARY_CONTEXT_LIMIT")
|
|
|
|
| 841 |
sys.stdout = CallbackWriter(log_callback)
|
| 842 |
|
| 843 |
def load_config(path: str) -> Dict:
|
| 844 |
+
# Gracefully handle missing config file (e.g. when running from Gradio web UI)
|
| 845 |
+
# Instead of crashing, create a sensible default config from environment variables
|
| 846 |
+
if not path or not os.path.exists(path):
|
| 847 |
+
print(f"[Info] Config file not found at '{path}', using environment variables and defaults")
|
| 848 |
+
cfg = {
|
| 849 |
+
'api_key': os.getenv('API_KEY') or os.getenv('OPENAI_API_KEY') or os.getenv('GEMINI_API_KEY', ''),
|
| 850 |
+
'model': os.getenv('MODEL', 'gemini-2.0-flash'),
|
| 851 |
+
'temperature': 0.1,
|
| 852 |
+
'max_tokens': 65536,
|
| 853 |
+
'context_limit_chapters': 3,
|
| 854 |
+
}
|
| 855 |
+
else:
|
| 856 |
+
with open(path, 'r', encoding='utf-8') as f:
|
| 857 |
+
cfg = json.load(f)
|
| 858 |
|
| 859 |
# override context_limit_chapters if GUI passed GLOSSARY_CONTEXT_LIMIT
|
| 860 |
env_limit = os.getenv("GLOSSARY_CONTEXT_LIMIT")
|