Kaito117 committed
Commit 80fa2cc · 1 Parent(s): eb51c32

broaden query gen, restrict max search limit to conserve api quota

app/core/query_gen.py CHANGED
@@ -14,15 +14,18 @@ You are an expert at writing Google dork queries to find LinkedIn and GitHub pro
 
 For any job description provided, extract useful keywords like title, location, company, and technical skills.
 
+## IMPORTANT:
+Before you generate anything: if the job title contains commas, only keep the most specific segment (for example, use "ML Research" from "Software Engineer, ML Research")
+
 Then return one LinkedIn query and one GitHub query in this JSON format:
 
 {
-  "linkedin": "site:linkedin.com/in \"job title\" keyword1 keyword2 \"location\" -student -intern -\"recent graduate\"",
-  "github": "site:github.com \"skill1\" \"skill2\" followers:>10 repos:>5"
+  "linkedin": "site:linkedin.com/in job title keyword1 keyword2 location -student -intern -recent graduate",
+  "github": "site:github.com skill1 skill2 followers:>10 repos:>5"
 }
 
 Make sure both queries are specific and useful.
-Always wrap the job title, location, required keywords (skills) and experience level in double quotes, for exact matching in Google dorks.
+For locations, use the city name without quotes to avoid JSON parsing issues.
 
 ## Example Input:
 Software Engineer, ML Research
@@ -53,10 +56,12 @@ Virtual ML Case Study (1 hour)
 Onsite (3 hours): Additional ML case, implementation project, and culture interview
 Offer Extended
 
+### If the job title has multiple parts to it (Software Engineer, ML Research), then select the most appropriate part (ML Research)
+
 ## Example Output:
 {
-  "linkedin": "site:linkedin.com/in \"ml research engineer\" \"llm\" \"mountain view\" -student -intern -\"recent graduate\"",
-  "github": "site:github.com \"llm\" \"transformers\" \"pytorch\" followers:>10 repos:>5"
+  "linkedin": "site:linkedin.com/in ml research llm mountain view -student -intern -recent graduate",
+  "github": "site:github.com llm transformers pytorch followers:>10 repos:>5"
 }
 """.strip()
 
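The rewritten prompt above drops the escaped double quotes from the example dorks because the model's reply is parsed as JSON, and inner quotes that come back unescaped break that parse; the trade-off is that the queries lose the exact-phrase matching the removed "Always wrap ... in double quotes" rule used to enforce. A minimal standalone sketch of the failure mode (illustrative only, not code from this repo):

```python
import json

# Old-style reply with inner quotes: only valid JSON if the model escapes each one.
escaped = '{"linkedin": "site:linkedin.com/in \\"ml research\\" -student"}'
# The same reply with the escapes dropped, an easy mistake for the model to make.
unescaped = '{"linkedin": "site:linkedin.com/in "ml research" -student"}'
# New-style reply with unquoted terms: there is nothing to escape.
unquoted = '{"linkedin": "site:linkedin.com/in ml research -student"}'

for label, raw in [("escaped", escaped), ("unescaped", unescaped), ("unquoted", unquoted)]:
    try:
        print(label, "->", json.loads(raw)["linkedin"])
    except json.JSONDecodeError as exc:
        print(label, "-> parse error:", exc)
```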
app/services/search_service.py CHANGED
@@ -22,13 +22,26 @@ class LinkedInSearchService:
         if self.serpapi_client:
             await self.serpapi_client.__aexit__(exc_type, exc_val, exc_tb)
 
-    async def search_candidates(self, job, max_candidates: int = 50) -> list[dict[str, Any]]:
+    async def search_candidates(self, job, max_candidates: int = 2) -> list[dict[str, Any]]:
         """Search for LinkedIn candidates based on job description"""
         try:
             # Generate search queries using existing query generator
             prompt_input = job.description or \
                 f"{job.title} at {job.company}, requirements: {', '.join(job.requirements)}, location: {job.location or ''}"
-            queries = await self.query_generator.generate_queries(prompt_input)
+            try:
+                queries = await self.query_generator.generate_queries(prompt_input)
+            except Exception as e:
+                logger.exception(f"Groq query generation failed: {e}, falling back to default dorks")
+                # simple fallback - extract the most specific part of the job title
+                title = job.title.replace('"', '')
+                # If title contains comma, take the last part (most specific)
+                if ',' in title:
+                    title = title.split(',')[-1].strip()
+                loc = (job.location or "").replace('"', '')
+                queries = {
+                    "linkedin": f'site:linkedin.com/in "{title}" "{loc}" -student -intern',
+                    "github": f'site:github.com "{title}" followers:>10 repos:>5'
+                }
             logger.info(f"Search queries generated: {queries}")
             linkedin_query = queries.get("linkedin", "")
 
@@ -38,15 +51,16 @@ class LinkedInSearchService:
 
             logger.info(f"Generated LinkedIn query: {linkedin_query}")
 
-            # Search using existing SerpAPI client
-            search_results = await self.serpapi_client.search(linkedin_query)
-            organic_results = search_results.get("organic_results", [])
+            # Search using existing SerpAPI client - limit to max 5 results to reduce API usage
+            search_results = await self.serpapi_client.search(linkedin_query, max_results=2)
+            # search_results is already the organic_results list
+            organic_results = search_results
 
             # Extract LinkedIn URLs using existing URL extractor
             linkedin_urls = self.url_extractor.extract_linkedin_urls(organic_results)
 
-            # Limit results
-            linkedin_urls = linkedin_urls[:max_candidates]
+            # Results are already limited by the search API call, but ensure max_candidates is respected
+            linkedin_urls = linkedin_urls[:min(max_candidates, 2)]
 
             # Convert to simple dict format
             candidates = []
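The call sites in this hunk (and in the GitHubSearchService added below) assume that SerpAPIClient.search() now accepts a max_results argument and already returns the list of organic results rather than the full SerpAPI payload. That client is not part of this commit; purely as a hypothetical sketch of the contract the service code relies on (httpx, the num request parameter, and the SERPAPI_API_KEY variable name are assumptions, not taken from this repo):

```python
import os

import httpx


class SerpAPIClient:
    """Hypothetical sketch of the async wrapper the services expect; the real one lives elsewhere."""

    async def __aenter__(self):
        self._http = httpx.AsyncClient(timeout=30)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self._http.aclose()

    async def search(self, query: str, max_results: int = 10) -> list[dict]:
        # SerpAPI's Google engine caps the number of returned results via `num`.
        resp = await self._http.get(
            "https://serpapi.com/search.json",
            params={
                "engine": "google",
                "q": query,
                "num": max_results,
                "api_key": os.environ["SERPAPI_API_KEY"],  # assumed variable name
            },
        )
        resp.raise_for_status()
        # Hand back only the organic results, matching how search_candidates consumes them.
        return resp.json().get("organic_results", [])
```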
@@ -79,14 +93,86 @@ class LinkedInSearchService:
 
 
 class GitHubSearchService:
-    """Service for searching GitHub profiles (placeholder for future use)"""
+    """Service for searching GitHub profiles"""
+
+    def __init__(self):
+        self.query_generator = SearchQueryGenerator()
+        self.url_extractor = URLExtractor()
+        self.serpapi_client = None
 
     async def __aenter__(self):
+        self.serpapi_client = SerpAPIClient()
+        await self.serpapi_client.__aenter__()
         return self
 
     async def __aexit__(self, exc_type, exc_val, exc_tb):
-        pass
+        if self.serpapi_client:
+            await self.serpapi_client.__aexit__(exc_type, exc_val, exc_tb)
+
+    async def search_candidates(self, job, max_candidates: int = 2) -> list[dict[str, Any]]:
+        """Search for GitHub candidates based on job description"""
+        try:
+            # Generate search queries using existing query generator
+            prompt_input = job.description or \
+                f"{job.title} at {job.company}, requirements: {', '.join(job.requirements)}, location: {job.location or ''}"
+            try:
+                queries = await self.query_generator.generate_queries(prompt_input)
+            except Exception as e:
+                logger.exception(f"Groq query generation failed: {e}, falling back to default dorks")
+                # simple fallback - extract the most specific part of the job title
+                title = job.title.replace('"', '')
+                # If title contains comma, take the last part (most specific)
+                if ',' in title:
+                    title = title.split(',')[-1].strip()
+                queries = {
+                    "linkedin": f'site:linkedin.com/in "{title}" -student -intern',
+                    "github": f'site:github.com "{title}" followers:>10 repos:>5'
+                }
+
+            github_query = queries.get("github", "")
+
+            if not github_query:
+                logger.error("Failed to generate GitHub search query")
+                return []
+
+            logger.info(f"Generated GitHub query: {github_query}")
+
+            # Search using existing SerpAPI client - limit to max 5 results to reduce API usage
+            search_results = await self.serpapi_client.search(github_query, max_results=5)
+            # search_results is already the organic_results list
+            organic_results = search_results
+
+            # Extract GitHub URLs using existing URL extractor
+            github_urls = self.url_extractor.extract_github_urls(organic_results)
+
+            # Results are already limited by the search API call, but ensure max_candidates is respected
+            github_urls = github_urls[:min(max_candidates, 2)]
+
+            # Convert to simple dict format
+            candidates = []
+            for url in github_urls:
+                candidates.append({
+                    "github_url": url,
+                    "name": self._extract_name_from_url(url),
+                    "source": "github_search"
+                })
+
+            logger.info(f"Found {len(candidates)} GitHub candidates")
+            return candidates
+
+        except Exception as e:
+            logger.error(f"Error searching GitHub candidates: {str(e)}")
+            return []
 
-    async def search_candidates(self, job, max_candidates: int = 50) -> list[dict[str, Any]]:
-        """Placeholder for GitHub search"""
-        return []
+    def _extract_name_from_url(self, github_url: str) -> str:
+        """Extract basic name from GitHub URL"""
+        try:
+            # Extract from URL pattern: github.com/username
+            parts = github_url.split('github.com/')[-1].split('/')
+            if len(parts) >= 1:
+                username = parts[0]
+                # Convert username to a more readable name
+                return username.replace('-', ' ').replace('_', ' ').title()
+            return "Unknown"
+        except Exception:
+            return "Unknown"
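For completeness, both services are async context managers, so a caller would drive them roughly as shown below. A minimal usage sketch, assuming both constructors take no arguments (as the GitHubSearchService __init__ added above does) and using a Job stand-in that only models the attributes the services actually read:

```python
import asyncio
from dataclasses import dataclass, field
from typing import Optional

from app.services.search_service import GitHubSearchService, LinkedInSearchService


@dataclass
class Job:
    # Minimal stand-in for the real job model; only the fields the services touch.
    title: str
    company: str
    requirements: list[str] = field(default_factory=list)
    location: Optional[str] = None
    description: Optional[str] = None


async def main():
    job = Job(
        title="Software Engineer, ML Research",
        company="Acme AI",  # placeholder values for illustration
        requirements=["pytorch", "transformers", "llm"],
        location="Mountain View",
    )
    async with LinkedInSearchService() as linkedin, GitHubSearchService() as github:
        candidates = await linkedin.search_candidates(job, max_candidates=2)
        candidates += await github.search_candidates(job, max_candidates=2)
    for candidate in candidates:
        print(candidate)


asyncio.run(main())
```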