Kaito117 committed
Commit 80fa2cc · 1 Parent(s): eb51c32

broaden query gen, restrict max search limit to conserve api quota

app/core/query_gen.py CHANGED
@@ -14,15 +14,18 @@ You are an expert at writing Google dork queries to find LinkedIn and GitHub pro
 
 For any job description provided, extract useful keywords like title, location, company, and technical skills.
 
+## IMPORTANT:
+Before you generate anything: if the job title contains commas, only keep the most specific segment (for example, use "ML Research" from "Software Engineer, ML Research")
+
 Then return one LinkedIn query and one GitHub query in this JSON format:
 
 {
-  "linkedin": "site:linkedin.com/in \"job title\" keyword1 keyword2 \"location\" -student -intern -\"recent graduate\"",
-  "github": "site:github.com \"skill1\" \"skill2\" followers:>10 repos:>5"
+  "linkedin": "site:linkedin.com/in job title keyword1 keyword2 location -student -intern -recent graduate",
+  "github": "site:github.com skill1 skill2 followers:>10 repos:>5"
 }
 
 Make sure both queries are specific and useful.
-Always wrap the job title, location, required keywords (skills) and experience level in double quotes, for exact matching in Google dorks.
+For locations, use the city name without quotes to avoid JSON parsing issues.
 
 ## Example Input:
 Software Engineer, ML Research
@@ -53,10 +56,12 @@ Virtual ML Case Study (1 hour)
 Onsite (3 hours): Additional ML case, implementation project, and culture interview
 Offer Extended
 
+### If the job title has multiple parts to it (Software Engineer, ML Research), then select the most appropriate part (ML Research)
+
 ## Example Output:
 {
-  "linkedin": "site:linkedin.com/in \"ml research engineer\" \"llm\" \"mountain view\" -student -intern -\"recent graduate\"",
-  "github": "site:github.com \"llm\" \"transformers\" \"pytorch\" followers:>10 repos:>5"
+  "linkedin": "site:linkedin.com/in ml research llm mountain view -student -intern -recent graduate",
+  "github": "site:github.com llm transformers pytorch followers:>10 repos:>5"
 }
 """.strip()
 
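The rewritten prompt above drops the escaped double quotes from the example dorks because the model's reply is parsed as JSON, and inner quotes that come back unescaped break that parse; the trade-off is that the queries lose the exact-phrase matching the removed "Always wrap ... in double quotes" rule used to enforce. A minimal standalone sketch of the failure mode (illustrative only, not code from this repo):

```python
import json

# Old-style reply with inner quotes: only valid JSON if the model escapes each one.
escaped = '{"linkedin": "site:linkedin.com/in \\"ml research\\" -student"}'
# The same reply with the escapes dropped, an easy mistake for the model to make.
unescaped = '{"linkedin": "site:linkedin.com/in "ml research" -student"}'
# New-style reply with unquoted terms: there is nothing to escape.
unquoted = '{"linkedin": "site:linkedin.com/in ml research -student"}'

for label, raw in [("escaped", escaped), ("unescaped", unescaped), ("unquoted", unquoted)]:
    try:
        print(label, "->", json.loads(raw)["linkedin"])
    except json.JSONDecodeError as exc:
        print(label, "-> parse error:", exc)
```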
app/services/search_service.py CHANGED
@@ -22,13 +22,26 @@ class LinkedInSearchService:
         if self.serpapi_client:
             await self.serpapi_client.__aexit__(exc_type, exc_val, exc_tb)
 
-    async def search_candidates(self, job, max_candidates: int = 50) -> list[dict[str, Any]]:
+    async def search_candidates(self, job, max_candidates: int = 2) -> list[dict[str, Any]]:
         """Search for LinkedIn candidates based on job description"""
         try:
             # Generate search queries using existing query generator
             prompt_input = job.description or \
                 f"{job.title} at {job.company}, requirements: {', '.join(job.requirements)}, location: {job.location or ''}"
-            queries = await self.query_generator.generate_queries(prompt_input)
+            try:
+                queries = await self.query_generator.generate_queries(prompt_input)
+            except Exception as e:
+                logger.exception(f"Groq query generation failed: {e}, falling back to default dorks")
+                # simple fallback - extract the most specific part of the job title
+                title = job.title.replace('"', '')
+                # If title contains comma, take the last part (most specific)
+                if ',' in title:
+                    title = title.split(',')[-1].strip()
+                loc = (job.location or "").replace('"', '')
+                queries = {
+                    "linkedin": f'site:linkedin.com/in "{title}" "{loc}" -student -intern',
+                    "github": f'site:github.com "{title}" followers:>10 repos:>5'
+                }
             logger.info(f"Search queries generated: {queries}")
             linkedin_query = queries.get("linkedin", "")
 
@@ -38,15 +51,16 @@ class LinkedInSearchService:
 
             logger.info(f"Generated LinkedIn query: {linkedin_query}")
 
-            # Search using existing SerpAPI client
-            search_results = await self.serpapi_client.search(linkedin_query)
-            organic_results = search_results.get("organic_results", [])
+            # Search using existing SerpAPI client - limit to max 5 results to reduce API usage
+            search_results = await self.serpapi_client.search(linkedin_query, max_results=2)
+            # search_results is already the organic_results list
+            organic_results = search_results
 
             # Extract LinkedIn URLs using existing URL extractor
             linkedin_urls = self.url_extractor.extract_linkedin_urls(organic_results)
 
-            # Limit results
-            linkedin_urls = linkedin_urls[:max_candidates]
+            # Results are already limited by the search API call, but ensure max_candidates is respected
+            linkedin_urls = linkedin_urls[:min(max_candidates, 2)]
 
             # Convert to simple dict format
             candidates = []
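The call sites in this hunk (and in the GitHubSearchService added below) assume that SerpAPIClient.search() now accepts a max_results argument and already returns the list of organic results rather than the full SerpAPI payload. That client is not part of this commit; purely as a hypothetical sketch of the contract the service code relies on (httpx, the num request parameter, and the SERPAPI_API_KEY variable name are assumptions, not taken from this repo):

```python
import os

import httpx


class SerpAPIClient:
    """Hypothetical sketch of the async wrapper the services expect; the real one lives elsewhere."""

    async def __aenter__(self):
        self._http = httpx.AsyncClient(timeout=30)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self._http.aclose()

    async def search(self, query: str, max_results: int = 10) -> list[dict]:
        # SerpAPI's Google engine caps the number of returned results via `num`.
        resp = await self._http.get(
            "https://serpapi.com/search.json",
            params={
                "engine": "google",
                "q": query,
                "num": max_results,
                "api_key": os.environ["SERPAPI_API_KEY"],  # assumed variable name
            },
        )
        resp.raise_for_status()
        # Hand back only the organic results, matching how search_candidates consumes them.
        return resp.json().get("organic_results", [])
```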
@@ -79,14 +93,86 @@ class LinkedInSearchService:
 
 
 class GitHubSearchService:
-    """Service for searching GitHub profiles (placeholder for future use)"""
+    """Service for searching GitHub profiles"""
+
+    def __init__(self):
+        self.query_generator = SearchQueryGenerator()
+        self.url_extractor = URLExtractor()
+        self.serpapi_client = None
 
     async def __aenter__(self):
+        self.serpapi_client = SerpAPIClient()
+        await self.serpapi_client.__aenter__()
         return self
 
     async def __aexit__(self, exc_type, exc_val, exc_tb):
-        pass
+        if self.serpapi_client:
+            await self.serpapi_client.__aexit__(exc_type, exc_val, exc_tb)
+
+    async def search_candidates(self, job, max_candidates: int = 2) -> list[dict[str, Any]]:
+        """Search for GitHub candidates based on job description"""
+        try:
+            # Generate search queries using existing query generator
+            prompt_input = job.description or \
+                f"{job.title} at {job.company}, requirements: {', '.join(job.requirements)}, location: {job.location or ''}"
+            try:
+                queries = await self.query_generator.generate_queries(prompt_input)
+            except Exception as e:
+                logger.exception(f"Groq query generation failed: {e}, falling back to default dorks")
+                # simple fallback - extract the most specific part of the job title
+                title = job.title.replace('"', '')
+                # If title contains comma, take the last part (most specific)
+                if ',' in title:
+                    title = title.split(',')[-1].strip()
+                queries = {
+                    "linkedin": f'site:linkedin.com/in "{title}" -student -intern',
+                    "github": f'site:github.com "{title}" followers:>10 repos:>5'
+                }
+
+            github_query = queries.get("github", "")
+
+            if not github_query:
+                logger.error("Failed to generate GitHub search query")
+                return []
+
+            logger.info(f"Generated GitHub query: {github_query}")
+
+            # Search using existing SerpAPI client - limit to max 5 results to reduce API usage
+            search_results = await self.serpapi_client.search(github_query, max_results=5)
+            # search_results is already the organic_results list
+            organic_results = search_results
+
+            # Extract GitHub URLs using existing URL extractor
+            github_urls = self.url_extractor.extract_github_urls(organic_results)
+
+            # Results are already limited by the search API call, but ensure max_candidates is respected
+            github_urls = github_urls[:min(max_candidates, 2)]
+
+            # Convert to simple dict format
+            candidates = []
+            for url in github_urls:
+                candidates.append({
+                    "github_url": url,
+                    "name": self._extract_name_from_url(url),
+                    "source": "github_search"
+                })
+
+            logger.info(f"Found {len(candidates)} GitHub candidates")
+            return candidates
+
+        except Exception as e:
+            logger.error(f"Error searching GitHub candidates: {str(e)}")
+            return []
 
-    async def search_candidates(self, job, max_candidates: int = 50) -> list[dict[str, Any]]:
-        """Placeholder for GitHub search"""
-        return []
+    def _extract_name_from_url(self, github_url: str) -> str:
+        """Extract basic name from GitHub URL"""
+        try:
+            # Extract from URL pattern: github.com/username
+            parts = github_url.split('github.com/')[-1].split('/')
+            if len(parts) >= 1:
+                username = parts[0]
+                # Convert username to a more readable name
+                return username.replace('-', ' ').replace('_', ' ').title()
+            return "Unknown"
+        except Exception:
+            return "Unknown"
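For completeness, both services are async context managers, so a caller would drive them roughly as shown below. A minimal usage sketch, assuming both constructors take no arguments (as the GitHubSearchService __init__ added above does) and using a Job stand-in that only models the attributes the services actually read:

```python
import asyncio
from dataclasses import dataclass, field
from typing import Optional

from app.services.search_service import GitHubSearchService, LinkedInSearchService


@dataclass
class Job:
    # Minimal stand-in for the real job model; only the fields the services touch.
    title: str
    company: str
    requirements: list[str] = field(default_factory=list)
    location: Optional[str] = None
    description: Optional[str] = None


async def main():
    job = Job(
        title="Software Engineer, ML Research",
        company="Acme AI",  # placeholder values for illustration
        requirements=["pytorch", "transformers", "llm"],
        location="Mountain View",
    )
    async with LinkedInSearchService() as linkedin, GitHubSearchService() as github:
        candidates = await linkedin.search_candidates(job, max_candidates=2)
        candidates += await github.search_candidates(job, max_candidates=2)
    for candidate in candidates:
        print(candidate)


asyncio.run(main())
```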