Spaces:
No application file
No application file
broaden query gen, restrict max search limit to conserve api quota
Browse files- app/core/query_gen.py +10 -5
- app/services/search_service.py +98 -12
app/core/query_gen.py
CHANGED
|
@@ -14,15 +14,18 @@ You are an expert at writing Google dork queries to find LinkedIn and GitHub pro
|
|
| 14 |
|
| 15 |
For any job description provided, extract useful keywords like title, location, company, and technical skills.
|
| 16 |
|
|
|
|
|
|
|
|
|
|
| 17 |
Then return one LinkedIn query and one GitHub query in this JSON format:
|
| 18 |
|
| 19 |
{
|
| 20 |
-
"linkedin": "site:linkedin.com/in
|
| 21 |
-
"github": "site:github.com
|
| 22 |
}
|
| 23 |
|
| 24 |
Make sure both queries are specific and useful.
|
| 25 |
-
|
| 26 |
|
| 27 |
## Example Input:
|
| 28 |
Software Engineer, ML Research
|
|
@@ -53,10 +56,12 @@ Virtual ML Case Study (1 hour)
|
|
| 53 |
Onsite (3 hours): Additional ML case, implementation project, and culture interview
|
| 54 |
Offer Extended
|
| 55 |
|
|
|
|
|
|
|
| 56 |
## Example Output:
|
| 57 |
{
|
| 58 |
-
"linkedin": "site:linkedin.com/in
|
| 59 |
-
"github": "site:github.com
|
| 60 |
}
|
| 61 |
""".strip()
|
| 62 |
|
|
|
|
| 14 |
|
| 15 |
For any job description provided, extract useful keywords like title, location, company, and technical skills.
|
| 16 |
|
| 17 |
+
## IMPORTANT:
|
| 18 |
+
Before you generate anything: if the job title contains commas, only keep the most specific segment (for example, use "ML Research" from "Software Engineer, ML Research")
|
| 19 |
+
|
| 20 |
Then return one LinkedIn query and one GitHub query in this JSON format:
|
| 21 |
|
| 22 |
{
|
| 23 |
+
"linkedin": "site:linkedin.com/in job title keyword1 keyword2 location -student -intern -recent graduate",
|
| 24 |
+
"github": "site:github.com skill1 skill2 followers:>10 repos:>5"
|
| 25 |
}
|
| 26 |
|
| 27 |
Make sure both queries are specific and useful.
|
| 28 |
+
For locations, use the city name without quotes to avoid JSON parsing issues.
|
| 29 |
|
| 30 |
## Example Input:
|
| 31 |
Software Engineer, ML Research
|
|
|
|
| 56 |
Onsite (3 hours): Additional ML case, implementation project, and culture interview
|
| 57 |
Offer Extended
|
| 58 |
|
| 59 |
+
### If the job title has multiple parts to it (Software Engineer, ML Research), then select the most appropriate part (ML Research)
|
| 60 |
+
|
| 61 |
## Example Output:
|
| 62 |
{
|
| 63 |
+
"linkedin": "site:linkedin.com/in ml research llm mountain view -student -intern -recent graduate",
|
| 64 |
+
"github": "site:github.com llm transformers pytorch followers:>10 repos:>5"
|
| 65 |
}
|
| 66 |
""".strip()
|
| 67 |
|
app/services/search_service.py
CHANGED
|
@@ -22,13 +22,26 @@ class LinkedInSearchService:
|
|
| 22 |
if self.serpapi_client:
|
| 23 |
await self.serpapi_client.__aexit__(exc_type, exc_val, exc_tb)
|
| 24 |
|
| 25 |
-
async def search_candidates(self, job, max_candidates: int =
|
| 26 |
"""Search for LinkedIn candidates based on job description"""
|
| 27 |
try:
|
| 28 |
# Generate search queries using existing query generator
|
| 29 |
prompt_input = job.description or \
|
| 30 |
f"{job.title} at {job.company}, requirements: {', '.join(job.requirements)}, location: {job.location or ''}"
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
logger.info(f"Search queries generated: {queries}")
|
| 33 |
linkedin_query = queries.get("linkedin", "")
|
| 34 |
|
|
@@ -38,15 +51,16 @@ class LinkedInSearchService:
|
|
| 38 |
|
| 39 |
logger.info(f"Generated LinkedIn query: {linkedin_query}")
|
| 40 |
|
| 41 |
-
# Search using existing SerpAPI client
|
| 42 |
-
search_results = await self.serpapi_client.search(linkedin_query)
|
| 43 |
-
|
|
|
|
| 44 |
|
| 45 |
# Extract LinkedIn URLs using existing URL extractor
|
| 46 |
linkedin_urls = self.url_extractor.extract_linkedin_urls(organic_results)
|
| 47 |
|
| 48 |
-
#
|
| 49 |
-
linkedin_urls = linkedin_urls[:max_candidates]
|
| 50 |
|
| 51 |
# Convert to simple dict format
|
| 52 |
candidates = []
|
|
@@ -79,14 +93,86 @@ class LinkedInSearchService:
|
|
| 79 |
|
| 80 |
|
| 81 |
class GitHubSearchService:
|
| 82 |
-
"""Service for searching GitHub profiles
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
async def __aenter__(self):
|
|
|
|
|
|
|
| 85 |
return self
|
| 86 |
|
| 87 |
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
|
| 91 |
-
"""
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
if self.serpapi_client:
|
| 23 |
await self.serpapi_client.__aexit__(exc_type, exc_val, exc_tb)
|
| 24 |
|
| 25 |
+
async def search_candidates(self, job, max_candidates: int = 2) -> list[dict[str, Any]]:
|
| 26 |
"""Search for LinkedIn candidates based on job description"""
|
| 27 |
try:
|
| 28 |
# Generate search queries using existing query generator
|
| 29 |
prompt_input = job.description or \
|
| 30 |
f"{job.title} at {job.company}, requirements: {', '.join(job.requirements)}, location: {job.location or ''}"
|
| 31 |
+
try:
|
| 32 |
+
queries = await self.query_generator.generate_queries(prompt_input)
|
| 33 |
+
except Exception as e:
|
| 34 |
+
logger.exception(f"Groq query generation failed: {e}, falling back to default dorks")
|
| 35 |
+
# simple fallback - extract the most specific part of the job title
|
| 36 |
+
title = job.title.replace('"', '')
|
| 37 |
+
# If title contains comma, take the last part (most specific)
|
| 38 |
+
if ',' in title:
|
| 39 |
+
title = title.split(',')[-1].strip()
|
| 40 |
+
loc = (job.location or "").replace('"', '')
|
| 41 |
+
queries = {
|
| 42 |
+
"linkedin": f'site:linkedin.com/in "{title}" "{loc}" -student -intern',
|
| 43 |
+
"github": f'site:github.com "{title}" followers:>10 repos:>5'
|
| 44 |
+
}
|
| 45 |
logger.info(f"Search queries generated: {queries}")
|
| 46 |
linkedin_query = queries.get("linkedin", "")
|
| 47 |
|
|
|
|
| 51 |
|
| 52 |
logger.info(f"Generated LinkedIn query: {linkedin_query}")
|
| 53 |
|
| 54 |
+
# Search using existing SerpAPI client - limit to max 2 results to reduce API usage
|
| 55 |
+
search_results = await self.serpapi_client.search(linkedin_query, max_results=2)
|
| 56 |
+
# search_results is already the organic_results list
|
| 57 |
+
organic_results = search_results
|
| 58 |
|
| 59 |
# Extract LinkedIn URLs using existing URL extractor
|
| 60 |
linkedin_urls = self.url_extractor.extract_linkedin_urls(organic_results)
|
| 61 |
|
| 62 |
+
# Results are already limited by the search API call, but ensure max_candidates is respected
|
| 63 |
+
linkedin_urls = linkedin_urls[:min(max_candidates, 2)]
|
| 64 |
|
| 65 |
# Convert to simple dict format
|
| 66 |
candidates = []
|
|
|
|
| 93 |
|
| 94 |
|
| 95 |
class GitHubSearchService:
    """Service for searching GitHub profiles.

    Used as an async context manager: entering creates a ``SerpAPIClient``
    and enters it, exiting forwards the exit call to that client.
    """

    # Number of search results requested per query (kept small to conserve API quota).
    SEARCH_RESULT_LIMIT = 5
    # Hard upper bound on returned candidates, regardless of ``max_candidates``.
    MAX_CANDIDATES_CAP = 2

    def __init__(self):
        self.query_generator = SearchQueryGenerator()
        self.url_extractor = URLExtractor()
        # Created in __aenter__; stays None until the service is entered.
        self.serpapi_client = None

    async def __aenter__(self):
        self.serpapi_client = SerpAPIClient()
        await self.serpapi_client.__aenter__()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.serpapi_client:
            await self.serpapi_client.__aexit__(exc_type, exc_val, exc_tb)

    async def search_candidates(self, job, max_candidates: int = 2) -> list[dict[str, Any]]:
        """Search for GitHub candidates based on job description.

        Args:
            job: Object exposing ``description``, ``title``, ``company``,
                ``requirements`` and ``location`` attributes.
            max_candidates: Maximum number of candidates to return. Values
                above ``MAX_CANDIDATES_CAP`` are clamped to the cap; zero or
                negative values return an empty list without searching.

        Returns:
            A list of dicts with ``github_url``, ``name`` and ``source`` keys.
            Any failure is logged and reported as an empty list.
        """
        try:
            # Clamp first: a negative slice bound would drop items from the
            # end instead of returning none, and there is no point spending
            # API quota when nothing can be returned.
            limit = min(max_candidates, self.MAX_CANDIDATES_CAP)
            if limit <= 0:
                return []

            if self.serpapi_client is None:
                logger.error("GitHub search attempted before the SerpAPI client was opened")
                return []

            # Generate search queries using existing query generator
            prompt_input = job.description or (
                f"{job.title} at {job.company}, "
                f"requirements: {', '.join(job.requirements or [])}, "
                f"location: {job.location or ''}"
            )
            try:
                queries = await self.query_generator.generate_queries(prompt_input)
            except Exception as e:
                logger.exception("Groq query generation failed: %s, falling back to default dorks", e)
                queries = self._fallback_queries(job)

            github_query = queries.get("github", "")
            if not github_query:
                logger.error("Failed to generate GitHub search query")
                return []

            logger.info(f"Generated GitHub query: {github_query}")

            # The client's search() result is used directly as the list of
            # organic results.
            organic_results = await self.serpapi_client.search(
                github_query, max_results=self.SEARCH_RESULT_LIMIT
            )

            # Extract GitHub URLs using existing URL extractor, then enforce
            # the candidate limit.
            github_urls = self.url_extractor.extract_github_urls(organic_results)[:limit]

            # Convert to simple dict format
            candidates = [
                {
                    "github_url": url,
                    "name": self._extract_name_from_url(url),
                    "source": "github_search",
                }
                for url in github_urls
            ]

            logger.info(f"Found {len(candidates)} GitHub candidates")
            return candidates

        except Exception as e:
            # logger.exception keeps the traceback, which logger.error dropped.
            logger.exception("Error searching GitHub candidates: %s", e)
            return []

    @staticmethod
    def _fallback_queries(job) -> dict[str, str]:
        """Build default dork queries from the job title when generation fails."""
        # Quotes are stripped so the title can be wrapped in a quoted phrase.
        title = (job.title or "").replace('"', '')
        # If title contains comma, take the last part (most specific)
        if ',' in title:
            title = title.split(',')[-1].strip()
        # NOTE(review): ``followers:>10 repos:>5`` look like GitHub search
        # qualifiers; confirm the engine behind SerpAPIClient honours them.
        return {
            "linkedin": f'site:linkedin.com/in "{title}" -student -intern',
            "github": f'site:github.com "{title}" followers:>10 repos:>5',
        }

    def _extract_name_from_url(self, github_url: str) -> str:
        """Extract basic name from GitHub URL.

        Takes the first path segment after ``github.com/``, drops any query
        string or fragment, and title-cases it with ``-``/``_`` turned into
        spaces. Returns ``"Unknown"`` when the input is not a string, is not
        a GitHub URL, or has no username segment.
        """
        marker = "github.com/"
        if not isinstance(github_url, str) or marker not in github_url:
            return "Unknown"

        # Everything after the host; the username is its first segment.
        username = github_url.partition(marker)[2]
        for separator in "/?#":
            username = username.split(separator, 1)[0]
        username = username.strip()
        if not username:
            return "Unknown"

        # Convert username to a more readable name
        return username.replace('-', ' ').replace('_', ' ').title()
|