Shirochi committed on
Commit
b9b3c3c
·
verified ·
1 Parent(s): 246fd3a

Upload 4 files

Browse files
Files changed (3) hide show
  1. app.py +45 -13
  2. authgpt_auth.py +44 -0
  3. extract_glossary_from_epub.py +14 -2
app.py CHANGED
@@ -623,7 +623,7 @@ class GlossarionWeb:
623
  os.environ['MANUAL_GLOSSARY_MAX_TITLES'] = str(config('manual_glossary_max_titles', 30))
624
  os.environ['GLOSSARY_MAX_TEXT_SIZE'] = str(config('glossary_max_text_size', 0))
625
  os.environ['GLOSSARY_MAX_SENTENCES'] = str(config('glossary_max_sentences', 200))
626
- os.environ['GLOSSARY_CHAPTER_SPLIT_THRESHOLD'] = str(config('glossary_chapter_split_threshold', 8192))
627
  os.environ['MANUAL_GLOSSARY_FILTER_MODE'] = config('manual_glossary_filter_mode', 'all')
628
  os.environ['STRIP_HONORIFICS'] = '1' if config('strip_honorifics', True) else '0'
629
  os.environ['MANUAL_GLOSSARY_FUZZY_THRESHOLD'] = str(config('manual_glossary_fuzzy_threshold', 0.90))
@@ -711,8 +711,19 @@ class GlossarionWeb:
711
  # Output language
712
  os.environ['OUTPUT_LANGUAGE'] = config('output_language', 'English')
713
 
714
- # Glossary compression
715
- os.environ['COMPRESS_GLOSSARY_PROMPT'] = '1' if config('compress_glossary_prompt', False) else '0'
 
 
 
 
 
 
 
 
 
 
 
716
 
717
  # Additional glossary
718
  os.environ['ADD_ADDITIONAL_GLOSSARY'] = '1' if config('add_additional_glossary', False) else '0'
@@ -1416,6 +1427,7 @@ class GlossarionWeb:
1416
  yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Starting...", 0
1417
 
1418
  input_path = epub_file.name if hasattr(epub_file, 'name') else epub_file
 
1419
  output_path = input_path.replace('.epub', '_glossary.csv')
1420
 
1421
  extraction_logs.append(f"📖 Input: {os.path.basename(input_path)}")
@@ -1425,19 +1437,28 @@ class GlossarionWeb:
1425
  # Set all environment variables from config
1426
  self.set_all_environment_variables()
1427
 
1428
- # Set API key
1429
- if 'gpt' in model.lower():
1430
- os.environ['OPENAI_API_KEY'] = api_key
1431
- elif 'claude' in model.lower():
1432
- os.environ['ANTHROPIC_API_KEY'] = api_key
1433
- else:
1434
- os.environ['API_KEY'] = api_key
 
 
 
 
1435
 
1436
  extraction_logs.append("📋 Extracting text from EPUB...")
1437
  yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Extracting text...", 20
1438
 
1439
  # Set environment variables for glossary extraction
1440
  os.environ['MODEL'] = model
 
 
 
 
 
1441
  os.environ['GLOSSARY_MIN_FREQUENCY'] = str(min_frequency)
1442
  os.environ['GLOSSARY_MAX_NAMES'] = str(max_names)
1443
  os.environ['GLOSSARY_MAX_TITLES'] = str(max_titles)
@@ -1550,10 +1571,21 @@ class GlossarionWeb:
1550
  extraction_logs.append("🖍️ Writing glossary to CSV...")
1551
  yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Writing CSV...", 95
1552
 
1553
- if os.path.exists(output_path):
 
 
 
 
 
 
 
 
 
 
 
1554
  extraction_logs.append(f"✅ Glossary extracted successfully!")
1555
- extraction_logs.append(f"💾 Saved to: {os.path.basename(output_path)}")
1556
- yield output_path, gr.update(visible=True), gr.update(visible=False), "\n".join(extraction_logs), gr.update(visible=True), "Extraction complete!", 100
1557
  else:
1558
  extraction_logs.append("❌ Glossary extraction failed - output file not created")
1559
  yield None, None, gr.update(visible=False), "\n".join(extraction_logs), gr.update(visible=True), "Extraction failed", 0
 
623
  os.environ['MANUAL_GLOSSARY_MAX_TITLES'] = str(config('manual_glossary_max_titles', 30))
624
  os.environ['GLOSSARY_MAX_TEXT_SIZE'] = str(config('glossary_max_text_size', 0))
625
  os.environ['GLOSSARY_MAX_SENTENCES'] = str(config('glossary_max_sentences', 200))
626
+ os.environ['GLOSSARY_CHAPTER_SPLIT_THRESHOLD'] = str(config('glossary_chapter_split_threshold', 0))
627
  os.environ['MANUAL_GLOSSARY_FILTER_MODE'] = config('manual_glossary_filter_mode', 'all')
628
  os.environ['STRIP_HONORIFICS'] = '1' if config('strip_honorifics', True) else '0'
629
  os.environ['MANUAL_GLOSSARY_FUZZY_THRESHOLD'] = str(config('manual_glossary_fuzzy_threshold', 0.90))
 
711
  # Output language
712
  os.environ['OUTPUT_LANGUAGE'] = config('output_language', 'English')
713
 
714
+ # Glossary compression (enabled by default)
715
+ os.environ['COMPRESS_GLOSSARY_PROMPT'] = '1' if config('compress_glossary_prompt', True) else '0'
716
+
717
+ # Dynamic limit expansion (enabled by default)
718
+ os.environ['GLOSSARY_INCLUDE_ALL_CHARACTERS'] = '1' if config('glossary_include_all_characters', True) else '0'
719
+
720
+ # Auto glossary prompt from config (prevents fallback to hard-coded default)
721
+ auto_gloss_prompt = config('unified_auto_glosary_prompt3', '')
722
+ if auto_gloss_prompt:
723
+ os.environ['AUTO_GLOSSARY_PROMPT'] = auto_gloss_prompt
724
+
725
+ # Output token limit
726
+ os.environ['MAX_OUTPUT_TOKENS'] = str(config('max_output_tokens', 128000))
727
 
728
  # Additional glossary
729
  os.environ['ADD_ADDITIONAL_GLOSSARY'] = '1' if config('add_additional_glossary', False) else '0'
 
1427
  yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Starting...", 0
1428
 
1429
  input_path = epub_file.name if hasattr(epub_file, 'name') else epub_file
1430
+ output_json_path = input_path.replace('.epub', '_glossary.json')
1431
  output_path = input_path.replace('.epub', '_glossary.csv')
1432
 
1433
  extraction_logs.append(f"📖 Input: {os.path.basename(input_path)}")
 
1437
  # Set all environment variables from config
1438
  self.set_all_environment_variables()
1439
 
1440
+ # Set API key (set all common env vars for unified_api_client compatibility)
1441
+ os.environ['API_KEY'] = api_key
1442
+ os.environ['OPENAI_API_KEY'] = api_key
1443
+ os.environ['OPENAI_OR_Gemini_API_KEY'] = api_key
1444
+ os.environ['GEMINI_API_KEY'] = api_key
1445
+
1446
+ # Enable streaming logs so extraction shows real-time API output
1447
+ os.environ['ENABLE_STREAMING'] = '1'
1448
+ os.environ['LOG_STREAM_CHUNKS'] = '1'
1449
+ os.environ['ALLOW_BATCH_STREAM_LOGS'] = '1'
1450
+ os.environ['ALLOW_AUTHGPT_BATCH_STREAM_LOGS'] = '1'
1451
 
1452
  extraction_logs.append("📋 Extracting text from EPUB...")
1453
  yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Extracting text...", 20
1454
 
1455
  # Set environment variables for glossary extraction
1456
  os.environ['MODEL'] = model
1457
+
1458
+ # CRITICAL: Set EPUB_PATH and OUTPUT_PATH for extract_glossary_from_epub.main() GUI mode
1459
+ os.environ['EPUB_PATH'] = input_path
1460
+ os.environ['OUTPUT_PATH'] = output_json_path
1461
+
1462
  os.environ['GLOSSARY_MIN_FREQUENCY'] = str(min_frequency)
1463
  os.environ['GLOSSARY_MAX_NAMES'] = str(max_names)
1464
  os.environ['GLOSSARY_MAX_TITLES'] = str(max_titles)
 
1571
  extraction_logs.append("🖍️ Writing glossary to CSV...")
1572
  yield None, None, gr.update(visible=True), "\n".join(extraction_logs), gr.update(visible=True), "Writing CSV...", 95
1573
 
1574
+ # The extract_glossary_from_epub module saves CSV inside a Glossary/ subfolder
1575
+ glossary_dir = os.path.join(os.path.dirname(output_json_path), "Glossary")
1576
+ glossary_csv_in_subdir = os.path.join(glossary_dir, os.path.basename(output_path))
1577
+
1578
+ # Check multiple possible output locations
1579
+ found_output = None
1580
+ for candidate in [glossary_csv_in_subdir, output_path, output_json_path]:
1581
+ if os.path.exists(candidate):
1582
+ found_output = candidate
1583
+ break
1584
+
1585
+ if found_output:
1586
  extraction_logs.append(f"✅ Glossary extracted successfully!")
1587
+ extraction_logs.append(f"💾 Saved to: {os.path.basename(found_output)}")
1588
+ yield found_output, gr.update(visible=True), gr.update(visible=False), "\n".join(extraction_logs), gr.update(visible=True), "Extraction complete!", 100
1589
  else:
1590
  extraction_logs.append("❌ Glossary extraction failed - output file not created")
1591
  yield None, None, gr.update(visible=False), "\n".join(extraction_logs), gr.update(visible=True), "Extraction failed", 0
authgpt_auth.py CHANGED
@@ -459,6 +459,50 @@ class AuthGPTTokenStore:
459
  "Run the OAuth login flow first."
460
  )
461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
  print("🔄 AuthGPT: No valid token found – starting browser login…")
463
  new_tokens = run_oauth_flow()
464
  self.save_tokens(new_tokens)
 
459
  "Run the OAuth login flow first."
460
  )
461
 
462
+ # Detect headless environments (HF Spaces, Docker, etc.) where browser login is impossible
463
+ is_headless = (
464
+ os.environ.get("SPACE_ID") is not None
465
+ or os.environ.get("HF_SPACES") == "true"
466
+ or os.environ.get("DOCKER_CONTAINER") == "true"
467
+ or os.environ.get("KUBERNETES_SERVICE_HOST") is not None
468
+ )
469
+ if is_headless:
470
+ # Check for manually-provided tokens via environment variables
471
+ env_access = os.environ.get("AUTHGPT_ACCESS_TOKEN", "").strip()
472
+ env_refresh = os.environ.get("AUTHGPT_REFRESH_TOKEN", "").strip()
473
+ if env_access:
474
+ # User provided an access token directly — save and use it
475
+ manual_tokens = {
476
+ "access_token": env_access,
477
+ "expires_at": time.time() + 3600, # assume 1h validity
478
+ }
479
+ if env_refresh:
480
+ manual_tokens["refresh_token"] = env_refresh
481
+ self.save_tokens(manual_tokens)
482
+ logger.info("AuthGPT: Using access token from AUTHGPT_ACCESS_TOKEN env var")
483
+ return env_access
484
+ if env_refresh:
485
+ # Try refreshing with the provided refresh token
486
+ try:
487
+ refreshed = refresh_access_token(env_refresh)
488
+ self.save_tokens(refreshed)
489
+ logger.info("AuthGPT: Obtained access token via AUTHGPT_REFRESH_TOKEN env var")
490
+ return refreshed["access_token"]
491
+ except Exception as ref_exc:
492
+ raise RuntimeError(
493
+ f"AuthGPT: AUTHGPT_REFRESH_TOKEN was set but refresh failed: {ref_exc}\n"
494
+ "The refresh token may be expired. Please obtain a new one."
495
+ )
496
+ raise RuntimeError(
497
+ "AuthGPT: Browser-based OAuth login is not available in headless environments "
498
+ "(e.g. Hugging Face Spaces, Docker containers).\n"
499
+ "To use AuthGPT models, set one of these as environment secrets:\n"
500
+ " • AUTHGPT_ACCESS_TOKEN — a valid ChatGPT OAuth access token\n"
501
+ " • AUTHGPT_REFRESH_TOKEN — a ChatGPT OAuth refresh token (will auto-refresh)\n"
502
+ "You can obtain these by running the OAuth flow locally first, then copying\n"
503
+ "the tokens from ~/.glossarion/authgpt_tokens.json"
504
+ )
505
+
506
  print("🔄 AuthGPT: No valid token found – starting browser login…")
507
  new_tokens = run_oauth_flow()
508
  self.save_tokens(new_tokens)
extract_glossary_from_epub.py CHANGED
@@ -841,8 +841,20 @@ def set_output_redirect(log_callback=None):
841
  sys.stdout = CallbackWriter(log_callback)
842
 
843
  def load_config(path: str) -> Dict:
844
- with open(path, 'r', encoding='utf-8') as f:
845
- cfg = json.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
846
 
847
  # override context_limit_chapters if GUI passed GLOSSARY_CONTEXT_LIMIT
848
  env_limit = os.getenv("GLOSSARY_CONTEXT_LIMIT")
 
841
  sys.stdout = CallbackWriter(log_callback)
842
 
843
  def load_config(path: str) -> Dict:
844
+ # Gracefully handle missing config file (e.g. when running from Gradio web UI)
845
+ # Instead of crashing, create a sensible default config from environment variables
846
+ if not path or not os.path.exists(path):
847
+ print(f"[Info] Config file not found at '{path}', using environment variables and defaults")
848
+ cfg = {
849
+ 'api_key': os.getenv('API_KEY') or os.getenv('OPENAI_API_KEY') or os.getenv('GEMINI_API_KEY', ''),
850
+ 'model': os.getenv('MODEL', 'gemini-2.0-flash'),
851
+ 'temperature': 0.1,
852
+ 'max_tokens': 65536,
853
+ 'context_limit_chapters': 3,
854
+ }
855
+ else:
856
+ with open(path, 'r', encoding='utf-8') as f:
857
+ cfg = json.load(f)
858
 
859
  # override context_limit_chapters if GUI passed GLOSSARY_CONTEXT_LIMIT
860
  env_limit = os.getenv("GLOSSARY_CONTEXT_LIMIT")