Spaces:
Runtime error
Runtime error
Nguyen Quang Truong
committed on
Commit
·
723e191
1
Parent(s):
93a679a
[updates]
Browse files- Knowledge_Graph/.env +4 -0
- Knowledge_Graph/__pycache__/classNode.cpython-310.pyc +0 -0
- Knowledge_Graph/__pycache__/config.cpython-310.pyc +0 -0
- Knowledge_Graph/__pycache__/cypher_utils.cpython-310.pyc +0 -0
- Knowledge_Graph/__pycache__/process_data.cpython-310.pyc +0 -0
- Knowledge_Graph/classNode.py +43 -0
- Knowledge_Graph/config.py +40 -0
- Knowledge_Graph/cypher/count_nodes.cypher +3 -0
- Knowledge_Graph/cypher/count_relationships.cypher +3 -0
- Knowledge_Graph/cypher/delete_all.cypher +5 -0
- Knowledge_Graph/cypher_utils.py +120 -0
- Knowledge_Graph/init.py +19 -0
- Knowledge_Graph/process_data.py +16 -0
- Knowledge_Graph/tempCodeRunnerFile.py +1 -0
- Knowledge_Graph/update_knowledge_graph.py +78 -0
- requirements.txt +4 -0
- scrape_data.py → scrape_data_indeed/scrape_data.py +1 -1
- utils.py → scrape_data_indeed/utils.py +0 -0
Knowledge_Graph/.env
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NEO4J_URI=neo4j+s://7d728e56.databases.neo4j.io
|
| 2 |
+
NEO4J_USERNAME=neo4j
|
| 3 |
+
NEO4J_PASSWORD=v81MIwaDw3wd3NCcPMpHv4vDc9qAssCkVoYrf6Rk0a0
|
| 4 |
+
GEMINI_API_KEY=AIzaSyDVjpl5kun36J_EdFsuLrwFsgLuPACKh4c
|
Knowledge_Graph/__pycache__/classNode.cpython-310.pyc
ADDED
|
Binary file (3.97 kB). View file
|
|
|
Knowledge_Graph/__pycache__/config.cpython-310.pyc
ADDED
|
Binary file (1.01 kB). View file
|
|
|
Knowledge_Graph/__pycache__/cypher_utils.cpython-310.pyc
ADDED
|
Binary file (3.12 kB). View file
|
|
|
Knowledge_Graph/__pycache__/process_data.cpython-310.pyc
ADDED
|
Binary file (671 Bytes). View file
|
|
|
Knowledge_Graph/classNode.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
from typing import Any, List


class Location(BaseModel):
    """A place related to a job or company (e.g. an office or headquarters)."""
    name: str = Field(description="Location name")
    location_type: str | None = Field(description="Type of location: headquarters, office, etc; not a country, city.")


class Education(BaseModel):
    """An education requirement extracted from a job description."""
    name: str = Field(description="Degree name such as: Bachelor of Science, Master of Engineering, etc.")
    fields: str | None = Field(description="Fields of study such as: Computer Science, Math, Information Technology, etc.")
    status: str | None = Field(description="Education status: graduate, undergraduate, etc.")


class Skill(BaseModel):
    """A technology skill mentioned in a job description."""
    name: str = Field(description="Skill name")
    hypernym: str | None = Field(description="Hypernym of skill")


class Work_Exper(BaseModel):
    """A work-experience requirement."""
    name: str = Field(description="Work Experience name")
    duration: Any = Field(description="Years or months or level of experience")


class Work_Level(BaseModel):
    """Seniority level for the position."""
    name: str = Field(description="Work level: intern, senior, lead, CEO, etc.")


class Company(BaseModel):
    """The recruiting company and its structure."""
    # NOTE(review): "subdiaries" is a misspelling of "subsidiaries", but the
    # field name is part of the extraction schema — renaming would break
    # cypher_utils and any stored data, so it is kept as-is.
    subdiaries: List[str] | None = Field(description="Subsidiaries or teams belonging to the company. If not, it will not be returned.")
    locations: List[Location] | None = Field(description="Company headquarters or branches. If not, it will not be returned.")
    industry: List[str] | None = Field(description="The industry in which the company is doing business")


class Job(BaseModel, strict=True):
    """Structured representation of one job posting (LLM extraction target).

    The Field descriptions double as extraction instructions for the Gemini
    model, so they are written as prompts, not documentation.
    """
    description: str = Field(description="Brief summary of what to do when applying for this job.")
    work_at: Location | None = Field(description="Working location. If not, it will not be returned")
    work_mode: str | None = Field(description="Work at company (Onsite), Part-time, etc. If not, it will not be returned")
    work_level: Work_Level | None = Field(description="Work level such as: Intern, Fresher, Junior, etc.")
    education_requirements: List[Education] = Field(description="Education requirements")
    skill_requirements: List[Skill] = Field(description="Identify and list all the technology skills mentioned. These skills can be specific tools, frameworks, programming languages, or broader categories like 'cloud computing' or 'data science'.")
    work_exper_requirements: List[Work_Exper] = Field(description="Identify the specific years or months of experience required for each position or level of experience (e.g., entry-level, mid-level, senior). If the posting mentions preferred or desired experience, include that information as well.")
    benefit_compensation: str | None = Field(description="Benefits and compensations include: salary, dayoff, holiday, etc.")
    from_company: Company = Field(description="The company is recruiting for this job position")


class JobKnowledgeGraph(BaseModel):
    """Top-level extraction wrapper returned by the LLM."""
    job: Job = Field(description="Knowledge graph about job.")
|
Knowledge_Graph/config.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
from langchain_community.graphs import Neo4jGraph
import instructor
import os
from dotenv import load_dotenv


# Deterministic generation: temperature 0 so repeated extractions agree.
config = GenerationConfig(
    temperature=0,
)


def configure_setup():
    """Load credentials from .env, connect to Neo4j, and build a Gemini client.

    Returns:
        tuple: (Neo4jGraph connection, instructor-wrapped Gemini client).

    Raises:
        RuntimeError: if any required environment variable is missing.
    """
    load_dotenv()

    # Fail fast with a clear message; the original `os.environ[k] = os.getenv(k)`
    # raises an opaque TypeError when a variable is absent from .env.
    required = ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD", "GEMINI_API_KEY")
    missing = [key for key in required if not os.getenv(key)]
    if missing:
        raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")
    for key in required:
        os.environ[key] = os.getenv(key)

    # Neo4jGraph reads NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD from os.environ.
    neo4j_graph = Neo4jGraph()

    # Configure the Gemini SDK with the API key.
    genai.configure(api_key=os.environ["GEMINI_API_KEY"])

    # instructor enforces the JobKnowledgeGraph response schema on Gemini output.
    client = instructor.from_gemini(
        client=genai.GenerativeModel(
            model_name="models/gemini-1.5-flash-latest",
            generation_config=config,
        ),
        mode=instructor.Mode.GEMINI_JSON,
    )

    return neo4j_graph, client
|
Knowledge_Graph/cypher/count_nodes.cypher
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Return the total number of nodes in the graph.
MATCH (n)
RETURN COUNT(n) as countNodes
|
Knowledge_Graph/cypher/count_relationships.cypher
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Return the total number of (directed) relationships in the graph.
MATCH ()-[rel]->()
RETURN COUNT(rel) as countRelationships
|
Knowledge_Graph/cypher/delete_all.cypher
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Delete all nodes and relationships.
// The original query cartesian-producted two MATCH clauses: it deleted
// nothing when the graph had no relationships (first MATCH yields no rows),
// and DELETE on a still-attached node fails. DETACH DELETE removes each
// node together with its relationships in one pass.
MATCH (n)
DETACH DELETE n
|
Knowledge_Graph/cypher_utils.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def add_job_nodes(response, job_name):
    """Build the Cypher fragment that creates a Job node and its related nodes.

    Args:
        response: JobKnowledgeGraph-like object exposing a `.job` attribute.
        job_name: Title used as the Job node's `name` property.

    Returns:
        str: Cypher statements; the `job` variable remains bound so the
        company fragment from add_company_nodes() can be appended after it.

    NOTE(review): values are interpolated into the query text; quotes are
    escaped below, but parameterized queries would be safer long-term.
    """
    def esc(value):
        # Escape double quotes so LLM-extracted text cannot break the query.
        return str(value).replace('"', '\\"')

    job = response.job

    # Root node for this posting.
    cypher = f'''
    CREATE (job:Job {{name: "{esc(job_name)}"}})
    '''

    # Each SET clause is emitted on its own line: the original concatenated
    # consecutive SET clauses with no separator, producing invalid Cypher.
    if job.description:
        cypher += f'\nSET job.description = "{esc(job.description)}"\n'

    if job.work_mode:
        cypher += f'''
    SET job.work_mode = "{esc(job.work_mode)}"
    '''

    if job.benefit_compensation:
        cypher += f'''
    SET job.benefit_compensation = "{esc(job.benefit_compensation)}"
    '''

    # Working location.
    if job.work_at:
        cypher += f'''
    MERGE (loc: Location {{name: "{esc(job.work_at.name)}", location_type: "{esc(job.work_at.location_type)}"}})
    MERGE (job)-[:WORK_AT]->(loc)
    '''

    # Seniority level.
    if job.work_level:
        cypher += f'''
    MERGE (level: Work_LV {{name: "{esc(job.work_level.name)}"}})
    MERGE (job)-[:AT_LEVEL]->(level)
    '''

    # Required educations.
    if job.education_requirements:
        for i, edu in enumerate(job.education_requirements):
            cypher += f'''
    CREATE (edu_{i}:Education {{name: "{esc(edu.name)}"}})
    MERGE (job)-[:REQUIRES]->(edu_{i})
    '''
            if edu.fields:
                cypher += f'\nSET edu_{i}.fields = "{esc(edu.fields)}"\n'
            if edu.status:
                cypher += f'\nSET edu_{i}.status = "{esc(edu.status)}"\n'

    # Required skills (MERGE so identical skills are shared across jobs).
    if job.skill_requirements:
        for i, skill in enumerate(job.skill_requirements):
            cypher += f'''
    MERGE (skill_{i}:Skill {{name: "{esc(skill.name)}"}})
    MERGE (job)-[:REQUIRES]->(skill_{i})
    '''
            if skill.hypernym:
                cypher += f'''
    MERGE (hypernym_{i}:Skill {{name: "{esc(skill.hypernym)}"}})
    MERGE (skill_{i})-[:HYPERNYM]->(hypernym_{i})
    '''

    # Required work experiences.
    if job.work_exper_requirements:
        for i, exper in enumerate(job.work_exper_requirements):
            cypher += f'''
    MERGE (exper_{i}:Work_Exper {{name: "{esc(exper.name)}"}})
    MERGE (job)-[:REQUIRES]->(exper_{i})
    '''
            if exper.duration:
                cypher += f'\nSET exper_{i}.duration = "{esc(exper.duration)}"\n'

    return cypher
|
| 79 |
+
|
| 80 |
+
def add_company_nodes(response, company_name):
    """Build the Cypher fragment that links the Job node to its Company.

    Must be appended after add_job_nodes() output so the `job` variable is
    already bound in the combined query.

    Args:
        response: JobKnowledgeGraph-like object exposing `.job.from_company`.
        company_name: Name for the Company node.

    Returns:
        str: Cypher statements (quotes in values are escaped).
    """
    def esc(value):
        # Escape double quotes so interpolated text cannot break the query.
        return str(value).replace('"', '\\"')

    company = response.job.from_company

    # NOTE(review): relationship type "RECRUITES" is misspelled ("RECRUITS"),
    # but renaming would break existing graph data and queries — kept as-is.
    cypher = f'''
    MERGE (company:Company {{name: "{esc(company_name)}"}})
    MERGE (job)-[:FROM]->(company)
    MERGE (company)-[:RECRUITES]->(job)
    '''

    if company:
        # Subsidiaries / internal teams (field name "subdiaries" comes from
        # the extraction schema in classNode.py).
        if company.subdiaries:
            for i, sub in enumerate(company.subdiaries):
                cypher += f'''
    MERGE (sub_{i}:Company {{name: "{esc(sub)}"}})
    MERGE (company)-[:SUBDIARY]->(sub_{i})
    '''

        # Headquarters / branch locations.
        if company.locations:
            for i, loc in enumerate(company.locations):
                cypher += f'''
    MERGE (loc_{i}:Location {{name: "{esc(loc.name)}"}})
    MERGE (company)-[:LOCATES_IN]->(loc_{i})
    '''
                if loc.location_type:
                    cypher += f'\nSET loc_{i}.location_type = "{esc(loc.location_type)}"\n'

        # Industries the company operates in.
        if company.industry:
            for i, industry in enumerate(company.industry):
                cypher += f'''
    MERGE (industry_{i}:Industry {{name: "{esc(industry)}"}})
    MERGE (company)-[:OPERATES_IN]->(industry_{i})
    '''

    return cypher
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def make_cypher_query(response, job_title, company_name):
    """Compose the complete Cypher statement for one job posting.

    Concatenates the job fragment first so the `job` variable is bound
    before the company fragment references it.
    """
    return add_job_nodes(response, job_title) + add_company_nodes(response, company_name)
|
Knowledge_Graph/init.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dotenv import load_dotenv
import os

# One-off sanity check: load Neo4j credentials from .env and confirm they
# are visible through os.environ.
load_dotenv()

neo4j_uri = os.getenv('NEO4J_URI')
neo4j_username = os.getenv('NEO4J_USERNAME')
neo4j_password = os.getenv('NEO4J_PASSWORD')

# Fail with a clear message instead of `os.environ[...] = None` (TypeError).
missing = [name for name, value in (
    ("NEO4J_URI", neo4j_uri),
    ("NEO4J_USERNAME", neo4j_username),
    ("NEO4J_PASSWORD", neo4j_password),
) if not value]
if missing:
    raise RuntimeError(f"Missing .env entries: {', '.join(missing)}")

os.environ["NEO4J_URI"] = neo4j_uri
os.environ["NEO4J_USERNAME"] = neo4j_username
os.environ["NEO4J_PASSWORD"] = neo4j_password

print(f"Neo4j URI: {os.environ['NEO4J_URI']}")
print(f"Neo4j Username: {os.environ['NEO4J_USERNAME']}")
# Never echo the raw secret (the original printed the password verbatim).
print("Neo4j Password: ****")
|
Knowledge_Graph/process_data.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
def get_job_desc(filename):
    """Yield (job_title, company, job_description) for each post in a JSON file.

    The file maps arbitrary keys to records with "job", "company" and
    "job_description" fields; only the record values are consumed.
    """
    with open(filename, "r", encoding="utf-8") as fp:
        posts = json.load(fp)

    for entry in posts.values():
        yield entry["job"], entry["company"], entry["job_description"]
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
if __name__ == "__main__":
    # Smoke test: print every parsed record from the sample scrape.
    sample_file = "./data/data_2024_06_23.json"
    for record in get_job_desc(sample_file):
        print(record)
|
Knowledge_Graph/tempCodeRunnerFile.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# NOTE(review): tempCodeRunnerFile.py is an artifact of VS Code's "Code Runner"
# extension; this bare name expression has no effect. The file should be
# deleted and added to .gitignore.
knowledge_graph
|
Knowledge_Graph/update_knowledge_graph.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from config import configure_setup
from classNode import JobKnowledgeGraph
from cypher_utils import make_cypher_query
from process_data import get_job_desc
from datetime import date


if __name__ == "__main__":
    # Neo4j connection + instructor-wrapped Gemini client.
    knowledge_graph, client = configure_setup()

    with open("Knowledge_Graph/cypher/count_nodes.cypher", "r") as file:
        count_nodes_cypher = file.read()

    with open("Knowledge_Graph/cypher/count_relationships.cypher", "r") as file:
        count_relations_cypher = file.read()

    filename = "./data/data_2024_06_23.json"

    n_processed = 0
    # Distinct names throughout: the original assigned the generator to
    # `job_desc` and then rebound that same name to the description string
    # inside the loop, which worked only because the for-loop keeps its own
    # iterator reference.
    for job_title, company_name, job_desc in get_job_desc(filename):
        try:
            # Double quotes would break the f-string-built Cypher downstream.
            job_desc = job_desc.replace('"', "'")

            prompt = f"""
            Help me understand the following by describing it as a detailed knowledge graph.
            Only extract and present only the factual information.
            Always return results in capitalized form

            Job descriptions: {job_desc}
            """

            # instructor validates the response against JobKnowledgeGraph.
            resp = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                response_model=JobKnowledgeGraph,
            )

            cypher = make_cypher_query(resp, job_title, company_name)
            knowledge_graph.query(cypher)
            print(f"Added {job_title} @ {company_name} to Knowledge Graph.")

            n_processed += 1
        except Exception as e:
            # Best-effort batch: report the failing posting and move on.
            print(f"Skipping {job_title!r}: {e}")
            continue

    print(f"Processed {n_processed} job postings!")

    num_node = knowledge_graph.query(count_nodes_cypher)
    num_relation = knowledge_graph.query(count_relations_cypher)
    print(num_node[0], num_relation[0])
|
requirements.txt
CHANGED
|
@@ -6,3 +6,7 @@ numpy
|
|
| 6 |
pandas
|
| 7 |
bs4
|
| 8 |
chromedriver_autoinstaller
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
pandas
|
| 7 |
bs4
|
| 8 |
chromedriver_autoinstaller
|
| 9 |
+
instructor
|
| 10 |
+
langchain_community
|
| 11 |
+
google-generativeai
|
| 12 |
+
neo4j
|
scrape_data.py → scrape_data_indeed/scrape_data.py
RENAMED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
import argparse
|
| 3 |
from selenium.webdriver.edge.options import Options
|
| 4 |
from selenium import webdriver
|
| 5 |
-
from utils import save_data, access, info_job,search, init_driver
|
| 6 |
|
| 7 |
if __name__ == "__main__":
|
| 8 |
parser = argparse.ArgumentParser()
|
|
|
|
| 2 |
import argparse
|
| 3 |
from selenium.webdriver.edge.options import Options
|
| 4 |
from selenium import webdriver
|
| 5 |
+
from scrape_data_indeed.utils import save_data, access, info_job,search, init_driver
|
| 6 |
|
| 7 |
if __name__ == "__main__":
|
| 8 |
parser = argparse.ArgumentParser()
|
utils.py → scrape_data_indeed/utils.py
RENAMED
|
File without changes
|