curiouscurrent committed on
Commit
789c241
·
verified ·
1 Parent(s): e023318

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -55
app.py CHANGED
@@ -9,7 +9,7 @@ from functools import lru_cache
9
  # CONFIG
10
  # ----------------------------
11
  JSON_FILE = "form-submissions-1.json"
12
- MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
13
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
14
 
15
  if not HF_API_TOKEN:
@@ -29,13 +29,15 @@ CATEGORIES = {
29
  "Finance": ["Financial Analyst","Financial Advisor"]
30
  }
31
 
 
 
 
32
  # ----------------------------
33
- # LLM caching
34
  # ----------------------------
35
  @lru_cache(maxsize=512)
36
- def call_zephyr_cached(candidate_str, category_name, job_titles_tuple):
37
- try:
38
- prompt = f"""
39
  You are an HR assistant. Review this candidate and determine if they are suitable for the category '{category_name}'.
40
  The category includes the following job titles: {list(job_titles_tuple)}
41
 
@@ -43,8 +45,9 @@ Candidate JSON: {candidate_str}
43
 
44
  Respond only 'Yes' if suitable, otherwise 'No'.
45
  """
46
- headers = {"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"}
47
- payload = {"inputs": prompt}
 
48
  response = requests.post(
49
  f"https://api-inference.huggingface.co/models/{MODEL_ID}",
50
  headers=headers,
@@ -57,13 +60,13 @@ Respond only 'Yes' if suitable, otherwise 'No'.
57
  return "No"
58
  return result[0].get("generated_text","No")
59
  except Exception as e:
60
- print("Zephyr call failed:", e)
61
  return "No"
62
 
63
  # ----------------------------
64
- # Candidate filtering
65
  # ----------------------------
66
- def filter_candidates(category_name, job_titles):
67
  data = json.load(open(JSON_FILE, encoding="utf-8"))
68
  filtered = []
69
  for person in data:
@@ -75,40 +78,51 @@ def filter_candidates(category_name, job_titles):
75
  continue
76
  if any(role in job_titles for role in non_fullstack_roles):
77
  filtered.append(person)
78
- print(f"Filtered {len(filtered)} candidates for {category_name}")
79
  return filtered
80
 
81
- def get_top_candidates(category_name, job_titles, top_n=5):
82
- filtered_candidates = filter_candidates(category_name, job_titles)
 
 
 
 
83
  recommended = []
84
 
85
- for person in filtered_candidates:
86
- candidate_str = json.dumps(person)
87
- response = call_zephyr_cached(candidate_str, category_name, tuple(job_titles))
88
- if "Yes" in response:
89
- work_exps = person.get("work_experiences", [])
90
- non_fullstack_roles = [exp.get("roleName") for exp in work_exps if "full stack developer" not in exp.get("roleName","").lower()]
91
- recommended.append({
92
- "Name": person.get("name"),
93
- "Email": person.get("email"),
94
- "Phone": person.get("phone"),
95
- "Location": person.get("location"),
96
- "Roles": ", ".join(non_fullstack_roles),
97
- "Skills": ", ".join(person.get("skills", [])),
98
- "Salary": person.get("annual_salary_expectation", {}).get("full-time","N/A"),
99
- "Category": category_name
100
- })
101
-
102
- if not recommended:
103
- return pd.DataFrame()
104
-
105
- df = pd.DataFrame(recommended)
106
- df["Salary_sort"] = df["Salary"].apply(lambda s: float(s.replace("$","").replace(",","")) if isinstance(s,str) and s.startswith("$") else float('inf'))
107
- df = df.sort_values("Salary_sort").drop(columns=["Salary_sort"])
108
- return df.head(top_n)
 
 
 
 
 
 
 
 
109
 
110
  # ----------------------------
111
- # Show first 5 candidates from raw JSON
112
  # ----------------------------
113
  def show_first_candidates():
114
  data = json.load(open(JSON_FILE, encoding="utf-8"))
@@ -120,29 +134,23 @@ def show_first_candidates():
120
  # Gradio interface
121
  # ----------------------------
122
  def run_dashboard(category):
123
- if category not in CATEGORIES:
 
124
  return pd.DataFrame(), None
125
- df = get_top_candidates(category, CATEGORIES[category], top_n=5)
126
- if df.empty:
127
- return df, None
128
- file_path = "/tmp/outputs.csv"
129
- df.to_csv(file_path, index=False)
130
- return df, file_path
131
-
132
- demo = gr.Interface(
133
- fn=run_dashboard,
134
- inputs=gr.Dropdown(list(CATEGORIES.keys()), label="Select Category"),
135
- outputs=[gr.Dataframe(label="Top 5 Recommended Candidates"),
136
- gr.File(label="Download CSV")],
137
- title="Startup Candidate Dashboard - Zephyr-7B-Beta",
138
- description="Top 5 candidates per category using Zephyr LLM. Download CSV available."
139
- )
140
 
141
- # Add separate interface to show first 5 raw candidates
142
  with gr.Blocks() as app:
143
  gr.Markdown("### Raw JSON Preview: First 5 Candidates")
144
  gr.Dataframe(show_first_candidates(), label="First 5 Candidates from JSON")
145
  gr.Markdown("---")
 
 
 
 
 
 
 
 
146
  demo.render()
147
 
148
  if __name__ == "__main__":
 
9
  # CONFIG
10
  # ----------------------------
11
  JSON_FILE = "form-submissions-1.json"
12
+ MODEL_ID = "HuggingFaceH4/sgpt-3.5-mini" # smaller, faster, stable
13
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
14
 
15
  if not HF_API_TOKEN:
 
29
  "Finance": ["Financial Analyst","Financial Advisor"]
30
  }
31
 
32
+ BATCH_SIZE = 50 # send candidates in small batches to LLM
33
+ OUTPUT_FILE = "/tmp/outputs.csv"
34
+
35
  # ----------------------------
36
+ # LLM cached call
37
  # ----------------------------
38
  @lru_cache(maxsize=512)
39
+ def call_llm(candidate_str, category_name, job_titles_tuple):
40
+ prompt = f"""
 
41
  You are an HR assistant. Review this candidate and determine if they are suitable for the category '{category_name}'.
42
  The category includes the following job titles: {list(job_titles_tuple)}
43
 
 
45
 
46
  Respond only 'Yes' if suitable, otherwise 'No'.
47
  """
48
+ headers = {"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"}
49
+ payload = {"inputs": prompt}
50
+ try:
51
  response = requests.post(
52
  f"https://api-inference.huggingface.co/models/{MODEL_ID}",
53
  headers=headers,
 
60
  return "No"
61
  return result[0].get("generated_text","No")
62
  except Exception as e:
63
+ print("LLM call failed:", e)
64
  return "No"
65
 
66
  # ----------------------------
67
+ # Pre-filter JSON
68
  # ----------------------------
69
+ def prefilter_candidates(category_name, job_titles):
70
  data = json.load(open(JSON_FILE, encoding="utf-8"))
71
  filtered = []
72
  for person in data:
 
78
  continue
79
  if any(role in job_titles for role in non_fullstack_roles):
80
  filtered.append(person)
 
81
  return filtered
82
 
83
+ # ----------------------------
84
+ # Process batches and save CSV
85
+ # ----------------------------
86
+ def process_category(category_name):
87
+ job_titles = CATEGORIES[category_name]
88
+ filtered_candidates = prefilter_candidates(category_name, job_titles)
89
  recommended = []
90
 
91
+ for i in range(0, len(filtered_candidates), BATCH_SIZE):
92
+ batch = filtered_candidates[i:i+BATCH_SIZE]
93
+ for person in batch:
94
+ candidate_str = json.dumps(person)
95
+ response = call_llm(candidate_str, category_name, tuple(job_titles))
96
+ if "Yes" in response:
97
+ work_exps = person.get("work_experiences", [])
98
+ non_fullstack_roles = [exp.get("roleName") for exp in work_exps if "full stack developer" not in exp.get("roleName","").lower()]
99
+ rec = {
100
+ "Name": person.get("name"),
101
+ "Email": person.get("email"),
102
+ "Phone": person.get("phone"),
103
+ "Location": person.get("location"),
104
+ "Roles": ", ".join(non_fullstack_roles),
105
+ "Skills": ", ".join(person.get("skills", [])),
106
+ "Salary": person.get("annual_salary_expectation", {}).get("full-time","N/A"),
107
+ "Category": category_name
108
+ }
109
+ recommended.append(rec)
110
+ # Incrementally save to CSV
111
+ if recommended:
112
+ df_temp = pd.DataFrame(recommended)
113
+ if os.path.exists(OUTPUT_FILE):
114
+ df_temp.to_csv(OUTPUT_FILE, mode="a", header=False, index=False)
115
+ else:
116
+ df_temp.to_csv(OUTPUT_FILE, index=False)
117
+
118
+ # Read full CSV and return top 5 for this category
119
+ df_all = pd.read_csv(OUTPUT_FILE)
120
+ df_category = df_all[df_all["Category"]==category_name]
121
+ df_category = df_category.sort_values("Salary", ascending=False).head(5)
122
+ return df_category
123
 
124
  # ----------------------------
125
+ # Show first 5 candidates from JSON
126
  # ----------------------------
127
  def show_first_candidates():
128
  data = json.load(open(JSON_FILE, encoding="utf-8"))
 
134
  # Gradio interface
135
  # ----------------------------
136
def run_dashboard(category):
    """Gradio callback: return (top-5 DataFrame, CSV path) for *category*.

    Returns (empty DataFrame, None) for an unknown category or when no
    candidate was accepted, so both UI outputs always receive valid values.
    """
    # BUG FIX: restore the category guard the previous revision had — without
    # it, an unknown category raises KeyError on CATEGORIES[category] inside
    # process_category instead of degrading gracefully.
    if category not in CATEGORIES:
        return pd.DataFrame(), None
    df_top5 = process_category(category)
    if df_top5.empty:
        return pd.DataFrame(), None
    return df_top5, OUTPUT_FILE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
 
142
# ----------------------------
# App layout
# ----------------------------
# Build the page top-to-bottom inside a single Blocks context: a static
# preview of the raw submissions JSON, a divider, then the interactive
# dashboard. demo.render() embeds the Interface into this Blocks page, so
# component creation order here is the on-screen order.
with gr.Blocks() as app:
    gr.Markdown("### Raw JSON Preview: First 5 Candidates")
    # NOTE: show_first_candidates() runs once at build time, not per request.
    gr.Dataframe(show_first_candidates(), label="First 5 Candidates from JSON")
    gr.Markdown("---")
    demo = gr.Interface(
        fn=run_dashboard,
        inputs=gr.Dropdown(list(CATEGORIES.keys()), label="Select Category"),
        outputs=[gr.Dataframe(label="Top 5 Recommended Candidates"),
                 gr.File(label="Download CSV")],
        title="Startup Candidate Dashboard - Batched LLM",
        description="Top 5 candidates per category using smaller LLM with batch processing."
    )
    demo.render()
155
 
156
  if __name__ == "__main__":