Shirochi committed on
Commit
b9b3c3c
·
verified ·
1 Parent(s): 246fd3a

Upload 4 files

Browse files
Files changed (3) hide show
  1. app.py +45 -13
  2. authgpt_auth.py +44 -0
  3. extract_glossary_from_epub.py +14 -2
app.py CHANGED
@@ -623,7 +623,7 @@ class GlossarionWeb:
623
  os.environ['MANUAL_GLOSSARY_MAX_TITLES'] = str(config('manual_glossary_max_titles', 30))
624
  os.environ['GLOSSARY_MAX_TEXT_SIZE'] = str(config('glossary_max_text_size', 0))
625
  os.environ['GLOSSARY_MAX_SENTENCES'] = str(config('glossary_max_sentences', 200))
626
- os.environ['GLOSSARY_CHAPTER_SPLIT_THRESHOLD'] = str(config('glossary_chapter_split_threshold', 8192))
627
  os.environ['MANUAL_GLOSSARY_FILTER_MODE'] = config('manual_glossary_filter_mode', 'all')
628
  os.environ['STRIP_HONORIFICS'] = '1' if config('strip_honorifics', True) else '0'
629
  os.environ['MANUAL_GLOSSARY_FUZZY_THRESHOLD'] = str(config('manual_glossary_fuzzy_threshold', 0.90))
@@ -711,8 +711,19 @@ class GlossarionWeb:
711
  # Output language
712
  os.environ['OUTPUT_LANGUAGE'] = config('output_language', 'English')
713
 
714
- # Glossary compression
715
- os.environ['COMPRESS_GLOSSARY_PROMPT'] = '1' if config('compress_glossary_prompt', False) else '0'
 
 
 
 
 
 
 
 
 
 
 
716
 
717
  # Additional glossary
718
  os.environ['ADD_ADDITIONAL_GLOSSARY'] = '1' if config('add_additional_glossary', False) else '0'
@@ -1416,6 +1427,7 @@ class GlossarionWeb:
1416
  yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Starting...", 0
1417
 
1418
  input_path = epub_file.name if hasattr(epub_file, 'name') else epub_file
 
1419
  output_path = input_path.replace('.epub', '_glossary.csv')
1420
 
1421
  extraction_logs.append(f"📖 Input: {os.path.basename(input_path)}")
@@ -1425,19 +1437,28 @@ class GlossarionWeb:
1425
  # Set all environment variables from config
1426
  self.set_all_environment_variables()
1427
 
1428
- # Set API key
1429
- if 'gpt' in model.lower():
1430
- os.environ['OPENAI_API_KEY'] = api_key
1431
- elif 'claude' in model.lower():
1432
- os.environ['ANTHROPIC_API_KEY'] = api_key
1433
- else:
1434
- os.environ['API_KEY'] = api_key
 
 
 
 
1435
 
1436
  extraction_logs.append("📋 Extracting text from EPUB...")
1437
  yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Extracting text...", 20
1438
 
1439
  # Set environment variables for glossary extraction
1440
  os.environ['MODEL'] = model
 
 
 
 
 
1441
  os.environ['GLOSSARY_MIN_FREQUENCY'] = str(min_frequency)
1442
  os.environ['GLOSSARY_MAX_NAMES'] = str(max_names)
1443
  os.environ['GLOSSARY_MAX_TITLES'] = str(max_titles)
@@ -1550,10 +1571,21 @@ class GlossarionWeb:
1550
  extraction_logs.append("🖍️ Writing glossary to CSV...")
1551
  yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Writing CSV...", 95
1552
 
1553
- if os.path.exists(output_path):
 
 
 
 
 
 
 
 
 
 
 
1554
  extraction_logs.append(f"✅ Glossary extracted successfully!")
1555
- extraction_logs.append(f"💾 Saved to: {os.path.basename(output_path)}")
1556
- yield output_path, gr.update(visible=True), gr.update(visible=False), "\n".join(extraction_logs), gr.update(visible=True), "Extraction complete!", 100
1557
  else:
1558
  extraction_logs.append("❌ Glossary extraction failed - output file not created")
1559
  yield None, None, gr.update(visible=False), "\n".join(extraction_logs), gr.update(visible=True), "Extraction failed", 0
 
623
  os.environ['MANUAL_GLOSSARY_MAX_TITLES'] = str(config('manual_glossary_max_titles', 30))
624
  os.environ['GLOSSARY_MAX_TEXT_SIZE'] = str(config('glossary_max_text_size', 0))
625
  os.environ['GLOSSARY_MAX_SENTENCES'] = str(config('glossary_max_sentences', 200))
626
+ os.environ['GLOSSARY_CHAPTER_SPLIT_THRESHOLD'] = str(config('glossary_chapter_split_threshold', 0))
627
  os.environ['MANUAL_GLOSSARY_FILTER_MODE'] = config('manual_glossary_filter_mode', 'all')
628
  os.environ['STRIP_HONORIFICS'] = '1' if config('strip_honorifics', True) else '0'
629
  os.environ['MANUAL_GLOSSARY_FUZZY_THRESHOLD'] = str(config('manual_glossary_fuzzy_threshold', 0.90))
 
711
  # Output language
712
  os.environ['OUTPUT_LANGUAGE'] = config('output_language', 'English')
713
 
714
+ # Glossary compression (enabled by default)
715
+ os.environ['COMPRESS_GLOSSARY_PROMPT'] = '1' if config('compress_glossary_prompt', True) else '0'
716
+
717
+ # Dynamic limit expansion (enabled by default)
718
+ os.environ['GLOSSARY_INCLUDE_ALL_CHARACTERS'] = '1' if config('glossary_include_all_characters', True) else '0'
719
+
720
+ # Auto glossary prompt from config (prevents fallback to hard-coded default)
721
+ auto_gloss_prompt = config('unified_auto_glosary_prompt3', '')
722
+ if auto_gloss_prompt:
723
+ os.environ['AUTO_GLOSSARY_PROMPT'] = auto_gloss_prompt
724
+
725
+ # Output token limit
726
+ os.environ['MAX_OUTPUT_TOKENS'] = str(config('max_output_tokens', 128000))
727
 
728
  # Additional glossary
729
  os.environ['ADD_ADDITIONAL_GLOSSARY'] = '1' if config('add_additional_glossary', False) else '0'
 
1427
  yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Starting...", 0
1428
 
1429
  input_path = epub_file.name if hasattr(epub_file, 'name') else epub_file
1430
+ output_json_path = input_path.replace('.epub', '_glossary.json')
1431
  output_path = input_path.replace('.epub', '_glossary.csv')
1432
 
1433
  extraction_logs.append(f"📖 Input: {os.path.basename(input_path)}")
 
1437
  # Set all environment variables from config
1438
  self.set_all_environment_variables()
1439
 
1440
+ # Set API key (set all common env vars for unified_api_client compatibility)
1441
+ os.environ['API_KEY'] = api_key
1442
+ os.environ['OPENAI_API_KEY'] = api_key
1443
+ os.environ['OPENAI_OR_Gemini_API_KEY'] = api_key
1444
+ os.environ['GEMINI_API_KEY'] = api_key
1445
+
1446
+ # Enable streaming logs so extraction shows real-time API output
1447
+ os.environ['ENABLE_STREAMING'] = '1'
1448
+ os.environ['LOG_STREAM_CHUNKS'] = '1'
1449
+ os.environ['ALLOW_BATCH_STREAM_LOGS'] = '1'
1450
+ os.environ['ALLOW_AUTHGPT_BATCH_STREAM_LOGS'] = '1'
1451
 
1452
  extraction_logs.append("📋 Extracting text from EPUB...")
1453
  yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Extracting text...", 20
1454
 
1455
  # Set environment variables for glossary extraction
1456
  os.environ['MODEL'] = model
1457
+
1458
+ # CRITICAL: Set EPUB_PATH and OUTPUT_PATH for extract_glossary_from_epub.main() GUI mode
1459
+ os.environ['EPUB_PATH'] = input_path
1460
+ os.environ['OUTPUT_PATH'] = output_json_path
1461
+
1462
  os.environ['GLOSSARY_MIN_FREQUENCY'] = str(min_frequency)
1463
  os.environ['GLOSSARY_MAX_NAMES'] = str(max_names)
1464
  os.environ['GLOSSARY_MAX_TITLES'] = str(max_titles)
 
1571
  extraction_logs.append("🖍️ Writing glossary to CSV...")
1572
  yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Writing CSV...", 95
1573
 
1574
+ # The extract_glossary_from_epub module saves CSV inside a Glossary/ subfolder
1575
+ glossary_dir = os.path.join(os.path.dirname(output_json_path), "Glossary")
1576
+ glossary_csv_in_subdir = os.path.join(glossary_dir, os.path.basename(output_path))
1577
+
1578
+ # Check multiple possible output locations
1579
+ found_output = None
1580
+ for candidate in [glossary_csv_in_subdir, output_path, output_json_path]:
1581
+ if os.path.exists(candidate):
1582
+ found_output = candidate
1583
+ break
1584
+
1585
+ if found_output:
1586
  extraction_logs.append(f"✅ Glossary extracted successfully!")
1587
+ extraction_logs.append(f"💾 Saved to: {os.path.basename(found_output)}")
1588
+ yield found_output, gr.update(visible=True), gr.update(visible=False), "\n".join(extraction_logs), gr.update(visible=True), "Extraction complete!", 100
1589
  else:
1590
  extraction_logs.append("❌ Glossary extraction failed - output file not created")
1591
  yield None, None, gr.update(visible=False), "\n".join(extraction_logs), gr.update(visible=True), "Extraction failed", 0
authgpt_auth.py CHANGED
@@ -459,6 +459,50 @@ class AuthGPTTokenStore:
459
  "Run the OAuth login flow first."
460
  )
461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
  print("🔄 AuthGPT: No valid token found – starting browser login…")
463
  new_tokens = run_oauth_flow()
464
  self.save_tokens(new_tokens)
 
459
  "Run the OAuth login flow first."
460
  )
461
 
462
+ # Detect headless environments (HF Spaces, Docker, etc.) where browser login is impossible
463
+ is_headless = (
464
+ os.environ.get("SPACE_ID") is not None
465
+ or os.environ.get("HF_SPACES") == "true"
466
+ or os.environ.get("DOCKER_CONTAINER") == "true"
467
+ or os.environ.get("KUBERNETES_SERVICE_HOST") is not None
468
+ )
469
+ if is_headless:
470
+ # Check for manually-provided tokens via environment variables
471
+ env_access = os.environ.get("AUTHGPT_ACCESS_TOKEN", "").strip()
472
+ env_refresh = os.environ.get("AUTHGPT_REFRESH_TOKEN", "").strip()
473
+ if env_access:
474
+ # User provided an access token directly — save and use it
475
+ manual_tokens = {
476
+ "access_token": env_access,
477
+ "expires_at": time.time() + 3600, # assume 1h validity
478
+ }
479
+ if env_refresh:
480
+ manual_tokens["refresh_token"] = env_refresh
481
+ self.save_tokens(manual_tokens)
482
+ logger.info("AuthGPT: Using access token from AUTHGPT_ACCESS_TOKEN env var")
483
+ return env_access
484
+ if env_refresh:
485
+ # Try refreshing with the provided refresh token
486
+ try:
487
+ refreshed = refresh_access_token(env_refresh)
488
+ self.save_tokens(refreshed)
489
+ logger.info("AuthGPT: Obtained access token via AUTHGPT_REFRESH_TOKEN env var")
490
+ return refreshed["access_token"]
491
+ except Exception as ref_exc:
492
+ raise RuntimeError(
493
+ f"AuthGPT: AUTHGPT_REFRESH_TOKEN was set but refresh failed: {ref_exc}\n"
494
+ "The refresh token may be expired. Please obtain a new one."
495
+ )
496
+ raise RuntimeError(
497
+ "AuthGPT: Browser-based OAuth login is not available in headless environments "
498
+ "(e.g. Hugging Face Spaces, Docker containers).\n"
499
+ "To use AuthGPT models, set one of these as environment secrets:\n"
500
+ " • AUTHGPT_ACCESS_TOKEN — a valid ChatGPT OAuth access token\n"
501
+ " • AUTHGPT_REFRESH_TOKEN — a ChatGPT OAuth refresh token (will auto-refresh)\n"
502
+ "You can obtain these by running the OAuth flow locally first, then copying\n"
503
+ "the tokens from ~/.glossarion/authgpt_tokens.json"
504
+ )
505
+
506
  print("🔄 AuthGPT: No valid token found – starting browser login…")
507
  new_tokens = run_oauth_flow()
508
  self.save_tokens(new_tokens)
extract_glossary_from_epub.py CHANGED
@@ -841,8 +841,20 @@ def set_output_redirect(log_callback=None):
841
  sys.stdout = CallbackWriter(log_callback)
842
 
843
  def load_config(path: str) -> Dict:
844
- with open(path, 'r', encoding='utf-8') as f:
845
- cfg = json.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
846
 
847
  # override context_limit_chapters if GUI passed GLOSSARY_CONTEXT_LIMIT
848
  env_limit = os.getenv("GLOSSARY_CONTEXT_LIMIT")
 
841
  sys.stdout = CallbackWriter(log_callback)
842
 
843
  def load_config(path: str) -> Dict:
844
+ # Gracefully handle missing config file (e.g. when running from Gradio web UI)
845
+ # Instead of crashing, create a sensible default config from environment variables
846
+ if not path or not os.path.exists(path):
847
+ print(f"[Info] Config file not found at '{path}', using environment variables and defaults")
848
+ cfg = {
849
+ 'api_key': os.getenv('API_KEY') or os.getenv('OPENAI_API_KEY') or os.getenv('GEMINI_API_KEY', ''),
850
+ 'model': os.getenv('MODEL', 'gemini-2.0-flash'),
851
+ 'temperature': 0.1,
852
+ 'max_tokens': 65536,
853
+ 'context_limit_chapters': 3,
854
+ }
855
+ else:
856
+ with open(path, 'r', encoding='utf-8') as f:
857
+ cfg = json.load(f)
858
 
859
  # override context_limit_chapters if GUI passed GLOSSARY_CONTEXT_LIMIT
860
  env_limit = os.getenv("GLOSSARY_CONTEXT_LIMIT")