MarlonKegel commited on
Commit
f291a48
·
1 Parent(s): 8be5285

adjusted output token settings and cost estimates

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. rag_ui.py +35 -25
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Post-Neoliberalism LLM
3
  emoji: 📚
4
  colorFrom: blue
5
  colorTo: green
 
1
  ---
2
+ title: Post-Neoliberalism Literature RAG
3
  emoji: 📚
4
  colorFrom: blue
5
  colorTo: green
rag_ui.py CHANGED
@@ -275,7 +275,7 @@ retrieval_col, llm_col = st.columns(2)
275
  with retrieval_col:
276
  st.subheader("Retrieval Settings")
277
  selected_labels = st.multiselect(
278
- "Pick one, multiple, or none (default: search *all* sources):",
279
  source_labels,
280
  default=[]
281
  )
@@ -286,7 +286,7 @@ with retrieval_col:
286
  chunk_idx_pool = [i for key in selected_keys for i in source_groups[key]]
287
 
288
  context_chunk_count = st.number_input(
289
- "How many relevant chunks to include:",
290
  min_value=3,
291
  max_value=30,
292
  value=15,
@@ -297,25 +297,26 @@ with llm_col:
297
  st.subheader("LLM Settings")
298
  selected_model_name = st.selectbox("Choose an OpenAI model:", model_friendly_names, index=0)
299
  selected_model = model_label_map[selected_model_name]
300
-
301
  # Max output tokens UI -- show as "words"
302
  max_output_words = st.number_input(
303
- "Max answer length (in words):",
304
  min_value=50,
305
  max_value=2000,
306
  value=800,
307
  step=50
308
  )
309
- # We'll later convert words to tokens for API (rough rule: words × 1.5 = tokens)
310
-
311
  # Advanced controls:
312
  with st.expander("Advanced LLM Controls (Optional)"):
 
 
313
  temp_value = st.slider(
314
  "Model randomness (temperature): Lower = more deterministic outputs (only GPT-4.1 and 4.1-mini)",
315
  0.0, 0.5, value=0.3, step=0.05,
316
  disabled=selected_model not in TEMPERATURE_MODELS,
317
  key="temperature_slider"
318
  )
 
 
319
  reasoning_effort = st.selectbox(
320
  "Reasoning effort (only for o3 and o4-mini):",
321
  ["default", "low", "medium", "high"],
@@ -323,27 +324,36 @@ with llm_col:
323
  disabled=selected_model not in REASONING_MODELS,
324
  key="reasoning_effort"
325
  )
326
- if selected_model not in TEMPERATURE_MODELS:
327
- st.caption("Temperature is only used for GPT-4.1 and GPT-4.1-mini.")
328
- if selected_model not in REASONING_MODELS:
329
- st.caption("Reasoning effort is only used for o3 and o4-mini.")
330
-
331
  user_temperature = float(temp_value)
332
  user_reasoning = reasoning_effort if reasoning_effort != "default" else None
333
- # Convert words to tokens for API call
334
- user_max_output_tokens = int(max_output_words * 1.5)
335
-
336
- # --- Pricing estimate (dollars only) ---
337
- # Estimation: input tokens = query + all chunks, output tokens = as set
338
- chunk_token = 750 # keeping for estimate, ~500-600 words per chunk = 750 tokens
339
- input_tok = context_chunk_count * chunk_token + len(question.split()) * 1.3 + 1800
340
- output_tok = user_max_output_tokens
341
- rates = MODEL_PRICING[selected_model]
342
- input_cost = (input_tok / 1_000_000) * rates["input"]
343
- output_cost = (output_tok / 1_000_000) * rates["output"]
344
- total_cost = input_cost + output_cost
345
-
346
- st.info(f"**API cost estimate for this query:** ${total_cost:.5f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
  if ask_clicked and question.strip():
349
  with st.spinner("Retrieving and generating answer..."):
 
275
  with retrieval_col:
276
  st.subheader("Retrieval Settings")
277
  selected_labels = st.multiselect(
278
+ "Select sources to search (default is _all_):",
279
  source_labels,
280
  default=[]
281
  )
 
286
  chunk_idx_pool = [i for key in selected_keys for i in source_groups[key]]
287
 
288
  context_chunk_count = st.number_input(
289
+ "Number of chunks passed on to the LLM:",
290
  min_value=3,
291
  max_value=30,
292
  value=15,
 
297
  st.subheader("LLM Settings")
298
  selected_model_name = st.selectbox("Choose an OpenAI model:", model_friendly_names, index=0)
299
  selected_model = model_label_map[selected_model_name]
 
300
  # Max output tokens UI -- show as "words"
301
  max_output_words = st.number_input(
302
+ "Max response length (# of words):",
303
  min_value=50,
304
  max_value=2000,
305
  value=800,
306
  step=50
307
  )
 
 
308
  # Advanced controls:
309
  with st.expander("Advanced LLM Controls (Optional)"):
310
+ if selected_model not in TEMPERATURE_MODELS:
311
+ st.caption("Temperature is only used for GPT-4.1 and GPT-4.1-mini.")
312
  temp_value = st.slider(
313
  "Model randomness (temperature): Lower = more deterministic outputs (only GPT-4.1 and 4.1-mini)",
314
  0.0, 0.5, value=0.3, step=0.05,
315
  disabled=selected_model not in TEMPERATURE_MODELS,
316
  key="temperature_slider"
317
  )
318
+ if selected_model not in REASONING_MODELS:
319
+ st.caption("Reasoning effort is only used for o3 and o4-mini.")
320
  reasoning_effort = st.selectbox(
321
  "Reasoning effort (only for o3 and o4-mini):",
322
  ["default", "low", "medium", "high"],
 
324
  disabled=selected_model not in REASONING_MODELS,
325
  key="reasoning_effort"
326
  )
 
 
 
 
 
327
  user_temperature = float(temp_value)
328
  user_reasoning = reasoning_effort if reasoning_effort != "default" else None
329
+ # Convert words to tokens for API call (model-aware token multiplier)
330
+ if selected_model in REASONING_MODELS:
331
+ if user_reasoning == "low":
332
+ output_token_multiplier = 7
333
+ elif user_reasoning == "medium" or user_reasoning is None:
334
+ output_token_multiplier = 12
335
+ elif user_reasoning == "high":
336
+ output_token_multiplier = 18
337
+ else:
338
+ output_token_multiplier = 12 # default
339
+ else:
340
+ output_token_multiplier = 1.5
341
+ user_max_output_tokens = int(max_output_words * output_token_multiplier)
342
+
343
+ # --- Pricing estimate (dollars only) ---
344
+ chunk_token = 750 # ~500-600 words per chunk ≈ 750 tokens
345
+ input_tok = context_chunk_count * chunk_token + len(question.split()) * 1.3 + 1800
346
+ output_tok = user_max_output_tokens
347
+ rates = MODEL_PRICING[selected_model]
348
+ input_cost = (input_tok / 1_000_000) * rates["input"]
349
+ output_cost = (output_tok / 1_000_000) * rates["output"]
350
+ total_cost = input_cost + output_cost
351
+
352
+ # Show price estimate, turn red if over $1
353
+ if total_cost > 1:
354
+ st.error(f"**API cost estimate for this query:** ${total_cost:.5f}")
355
+ else:
356
+ st.info(f"**API cost estimate for this query:** ${total_cost:.5f}")
357
 
358
  if ask_clicked and question.strip():
359
  with st.spinner("Retrieving and generating answer..."):