ABAO77 committed on
Commit
f01124b
verified
1 Parent(s): e00e339

Upload 14 files

Browse files
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.11

# Run as a non-root user (required for HF Spaces).
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Fix: the repository file is (mis)named "requirments.txt"; the original COPY
# referenced "requirements.txt", which does not exist and breaks the build.
COPY --chown=user ./requirments.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""FastAPI entry point exposing the recruitment chat workflow."""

from typing import Optional

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from langchain_core.messages import AIMessage, HumanMessage
from pydantic import BaseModel, Field

from graph import app as workflow

# Serve the interactive docs at the root path.
app = FastAPI(docs_url="/")

# Wide-open CORS: the API is consumed by browser front-ends on other hosts.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
19
+
20
+
21
class Item(BaseModel):
    """Request payload for the /chat endpoint."""

    messages: list = Field(..., description="List of messages")
    language: Optional[str] = Field("vi", description="Language of the messages")

    class Config:
        # Example payload rendered in the OpenAPI docs.
        json_schema_extra = {
            "example": {
                "messages": [
                    {"type": "human", "content": "Ch脿o b岷"},
                    {"type": "ai", "content": "B岷 mu峄憂 t矛m job g矛?"},
                    {
                        "type": "human",
                        "content": "T么i mu峄憂 t矛m job d岷 h峄峜 cho trung c岷",
                    },
                ],
                "language": "vi",
            }
        }
39
+
40
+
41
def convert_message(messages):
    """Map raw {"type", "content"} dicts to LangChain message objects.

    Any type other than "human" is treated as an AI message, mirroring the
    original two-way branch.
    """
    return [
        HumanMessage(content=msg["content"])
        if msg["type"] == "human"
        else AIMessage(content=msg["content"])
        for msg in messages
    ]
49
+
50
+
51
@app.post("/chat")
async def Chat(item: Item):
    """Run the LangGraph workflow on the latest user message.

    The last message is the current query; everything before it is chat
    history. Returns the model's text response, or a JSON error payload.
    """
    messages = convert_message(item.messages)
    if not messages:
        # Fix: the original indexed messages[-1] outside the try block, so an
        # empty payload produced an unhandled IndexError (500 + traceback).
        return JSONResponse(
            content={"error": "messages must not be empty"}, status_code=400
        )
    # messages[:-1] is already [] for a single message, so no length check needed.
    history = messages[:-1]
    try:
        response = workflow.invoke(
            {
                "user_query": messages[-1],
                "messages_history": history,
                "language": item.language,
            }
        )["llm_response"]
        return JSONResponse(content=response, status_code=200)
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)
66
+
67
+
68
if __name__ == "__main__":
    # Local development entry point; the container runs uvicorn via the
    # Dockerfile CMD instead.
    import uvicorn

    uvicorn.run("app:app")
config/.DS_Store ADDED
Binary file (6.15 kB). View file
 
config/__pycache__/database.cpython-311.pyc ADDED
Binary file (542 Bytes). View file
 
config/__pycache__/llm.cpython-311.pyc ADDED
Binary file (808 Bytes). View file
 
config/database.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
import os

from langchain_pinecone import PineconeVectorStore

from .llm import embeddings

# NOTE(review): the env var is spelled "PIPE_CONE" in the deployment config —
# keep the exact name even though it looks like a typo for "PINECONE".
API_PINCONE_KEY = os.getenv("PIPE_CONE")
index = "recruiment-new"

# Shared vector-store handle over the recruitment index.
vector_store = PineconeVectorStore(
    index_name=index, embedding=embeddings, pinecone_api_key=API_PINCONE_KEY
)
config/llm.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

# Two API keys let traffic be spread across separate quotas.
API_KEY_1 = os.getenv("API_KEY_1")
API_KEY_2 = os.getenv("API_KEY_2")


def _build_chat_model(api_key):
    """Build a Gemini chat model with the project's shared settings."""
    return ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        api_key=api_key,
    )


llm_1 = _build_chat_model(API_KEY_1)
llm_2 = _build_chat_model(API_KEY_2)

embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004", google_api_key=API_KEY_1
)
graph.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from langgraph.graph import StateGraph, START, END
from graph_function import (
    route_fn,
    transform_query_fn,
    retrieve_document_fn,
    grade_document_fn,
    gen_answer_normal_fn,
    grade_hallucinations_fn,
    generate_answer_rag_fn,
    State,
)


def routing_after_route(state: State):
    """Send recruitment questions to retrieval, everything else to casual chat."""
    if state["route"] == "vectorstore":
        return "transform_query"
    return "generate_answer_normal"


def routing_after_retrieve_document(state: State):
    """Grade documents when retrieval found any; otherwise answer casually."""
    if state["documents"]:
        return "grade_document"
    return "generate_answer_normal"


def route_after_grade_document(state: State):
    """Generate a RAG answer only if some documents survived grading."""
    if state["documents"]:
        return "generate_answer_rag"
    return "generate_answer_normal"


def routing_check_pass_grade_hallucinations(state: State):
    """Finish when the answer is graded grounded; otherwise fall back."""
    if state["grade_response"] == "yes":
        return END
    return "generate_answer_normal"


workflow = StateGraph(State)

# Register all nodes in one pass.
for _name, _fn in (
    ("routing", route_fn),
    ("transform_query", transform_query_fn),
    ("retrieve_document", retrieve_document_fn),
    ("grade_document", grade_document_fn),
    ("generate_answer_rag", generate_answer_rag_fn),
    ("grade_hallucinations", grade_hallucinations_fn),
    ("generate_answer_normal", gen_answer_normal_fn),
):
    workflow.add_node(_name, _fn)

workflow.add_edge(START, "routing")
workflow.add_conditional_edges(
    "routing",
    routing_after_route,
    {
        "transform_query": "transform_query",
        "generate_answer_normal": "generate_answer_normal",
    },
)
workflow.add_edge("transform_query", "retrieve_document")
workflow.add_conditional_edges(
    "retrieve_document",
    routing_after_retrieve_document,
    {
        "grade_document": "grade_document",
        "generate_answer_normal": "generate_answer_normal",
    },
)
workflow.add_conditional_edges(
    "grade_document",
    route_after_grade_document,
    {
        "generate_answer_rag": "generate_answer_rag",
        "generate_answer_normal": "generate_answer_normal",
    },
)
workflow.add_edge("generate_answer_rag", "grade_hallucinations")
workflow.add_conditional_edges(
    "grade_hallucinations",
    routing_check_pass_grade_hallucinations,
    {
        END: END,
        "generate_answer_normal": "generate_answer_normal",
    },
)
workflow.add_edge("generate_answer_normal", END)

# Compiled runnable; imported by app.py as `workflow`.
app = workflow.compile()
graph_function.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dotenv import load_dotenv

# Load environment variables before importing modules that read them at import
# time (config.llm / config.database pull API keys from os.environ).
load_dotenv()

# Fix: GradeDocuments was imported twice (once here and once in a separate
# redundant `from prompt import GradeDocuments` line); the duplicate is gone.
from prompt import (
    RouteQuery,
    GradeDocuments,
    GenerateAnswer,
    GradeHallucinations,
    ExtractFilter,
    route_chain,
    transform_query_chain,
    grade_documents_chain,
    gen_normal_answer_chain,
    gen_answer_rag_chain,
    grade_hallucinations_chain,
    extract_filter_chain,
)
from config.database import vector_store
from langchain_core.documents import Document
from helper import convert_list_context_source_to_str
from logger import logger
from langgraph.graph.message import AnyMessage, add_messages
from typing import TypedDict, Literal
24
+
25
+
26
+
27
class State(TypedDict):
    """Shared state flowing through the LangGraph workflow."""

    user_query: AnyMessage  # current (possibly rewritten) user message
    route: str  # "vectorstore" or "casual_convo"
    messages_history: list  # prior conversation messages
    documents: list[Document]  # retrieved, then relevance-filtered, docs
    filter: dict  # metadata filter (title/level) for retrieval
    llm_response: AnyMessage  # generated answer (assigned as text by the nodes)
    grade_response: Literal["yes", "no"]  # grounding-grade verdict
    language: str  # answer language code, e.g. "vi"
    document_id_selected: str  # id of the document the RAG answer cites
37
+
38
+
39
def route_fn(state: State):
    """Decide whether the query goes to the vectorstore or casual convo."""
    route_response: RouteQuery = route_chain.invoke(
        {"question": state["user_query"].content}
    )
    logger.info(f"Route response: {route_response}")
    return {"route": route_response.datasource}
44
+
45
+
46
def transform_query_fn(state: State):
    """Rewrite the user query for retrieval, folding in chat history.

    Fix: the rewrite prompt declares a ("placeholder", "{history}") slot, but
    the history was passed under the key "chat_history", so it was silently
    dropped (placeholder slots are optional). Pass it as "history" so the
    re-writer actually sees the conversation.
    """
    question = state["user_query"].content
    chat_history = state["messages_history"]
    transform_response = transform_query_chain.invoke(
        {"question": question, "history": chat_history}
    )
    logger.info(f"Transform response: {transform_response}")
    return {"user_query": transform_response}
54
+
55
+
56
def retrieve_document_fn(state: State):
    """Retrieve candidate job documents for the (rewritten) user query.

    Builds a metadata filter (title/level) from the question when the state
    does not already carry one, then runs a similarity search against the
    shared vector store.
    """
    question = state["user_query"].content
    history = state["messages_history"]
    # Fix: the local previously shadowed the builtin `filter`.
    doc_filter = state.get("filter", None)
    if not doc_filter:
        filter_response: ExtractFilter = extract_filter_chain.invoke(
            {"question": question, "history": history}
        )
        logger.info(f"Extract filter response: {filter_response}")
        doc_filter = {}
        if filter_response.job_title:
            doc_filter["title"] = filter_response.job_title
        if filter_response.job_level:
            doc_filter["level"] = filter_response.job_level
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        # NOTE(review): a threshold of 0.0 accepts every hit; raise it if the
        # score cut-off is meant to filter anything — confirm intended.
        search_kwargs={"k": 5, "score_threshold": 0.0},
    )
    documents = retriever.invoke(question, filter=doc_filter)
    logger.info(f"Retrieved documents: {documents}")
    return {"documents": documents}
79
+
80
+
81
def grade_document_fn(state: State):
    """Keep only the retrieved documents graded relevant to the question."""
    question = state["user_query"].content
    documents = state["documents"]
    batch_inputs = [
        {"question": question, "document": doc.page_content} for doc in documents
    ]
    grade_response: list[GradeDocuments] = grade_documents_chain.batch(batch_inputs)
    logger.info(f"Grade response: {grade_response}")
    # Pair each document with its grade and keep the "yes" ones.
    filtered_documents = [
        doc
        for doc, grade in zip(documents, grade_response)
        if grade.binary_score == "yes"
    ]

    return {"documents": filtered_documents}
95
+
96
+
97
def generate_answer_rag_fn(state: State):
    """Generate an answer grounded in the retrieved documents.

    Fix: `context_str` was only assigned inside `if documents:`, so calling
    this node with an empty document list raised NameError; it now defaults
    to "" (the prompt itself handles the no-context case).
    """
    question = state["user_query"].content
    documents = state["documents"]
    language = state["language"]
    context_str = ""
    if documents:
        context_str = convert_list_context_source_to_str(documents)

    gen_answer_response: GenerateAnswer = gen_answer_rag_chain.invoke(
        {"question": question, "context": context_str, "language": language}
    )
    logger.info(f"Generate answer response: {gen_answer_response}")
    id_selected = None
    if gen_answer_response.selected_document_index is not None:
        id_selected = documents[gen_answer_response.selected_document_index].metadata[
            "id"
        ]
    logger.info(f"Document id selected: {id_selected}")
    return {
        "llm_response": gen_answer_response.answer,
        "document_id_selected": id_selected,
    }
118
+
119
+
120
def grade_hallucinations_fn(state: State):
    """Grade whether the generated answer resolves the user's question."""
    payload = {
        "question": state["user_query"].content,
        "generation": state["llm_response"],
    }
    grade_response: GradeHallucinations = grade_hallucinations_chain.invoke(payload)
    logger.info(f"Grade hallucinations response: {grade_response}")
    return {"grade_response": grade_response.binary_score}
128
+
129
+
130
def gen_answer_normal_fn(state: State):
    """Answer without retrieval, using the chat history as context."""
    gen_answer_response = gen_normal_answer_chain.invoke(
        {
            "question": state["user_query"].content,
            "history": state["messages_history"],
        }
    )
    logger.info(f"Generate answer response: {gen_answer_response}")
    return {"llm_response": gen_answer_response.content}
helper.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.documents import Document
2
+
3
+
4
def convert_list_context_source_to_str(contexts: list[Document]):
    """Render retrieved documents as one indexed, separator-delimited string."""
    separator = "----------------------------------------------\n\n"
    sections = [
        f"Document index {idx}:\nContent: {doc.page_content}\n" + separator
        for idx, doc in enumerate(contexts)
    ]
    return "".join(sections)
job_data.xlsx ADDED
Binary file (112 kB). View file
 
logger.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+
6
+ import pytz
7
+
8
+
9
class CoreCFG:
    """Project-wide identity constants."""

    PROJECT_NAME = "RECRUITMENT"
    # str(...) wrapper from the original was redundant; the value is unchanged.
    BOT_NAME = "RECRUITMENT"
12
+
13
+
14
def get_date_time():
    """Current datetime in the Asia/Ho_Chi_Minh timezone."""
    return datetime.now(pytz.timezone("Asia/Ho_Chi_Minh"))


# Snapshots computed once at import time.
DATE_TIME = get_date_time().date()
BASE_DIR = os.path.dirname(Path(__file__).parent.parent)
# NOTE(review): LOG_DIR is defined but not used anywhere in this module —
# confirm whether a file handler was intended.
LOG_DIR = os.path.join(BASE_DIR, "logs")
21
+
22
+
23
class CustomFormatter(logging.Formatter):
    """Colorize console log records per level using ANSI escape codes."""

    green = "\x1b[0;32m"
    grey = "\x1b[38;5;248m"
    yellow = "\x1b[38;5;229m"
    red = "\x1b[31;20m"
    bold_red = "\x1b[31;1m"
    blue = "\x1b[38;5;31m"
    white = "\x1b[38;5;255m"
    reset = "\x1b[38;5;15m"

    base_format = f"{grey}%(asctime)s | %(name)s | %(threadName)s | {{level_color}}%(levelname)-8s{grey} | {blue}%(module)s:%(lineno)d{grey} - {white}%(message)s"

    # Fix: DEBUG was missing from the map, so debug records fell back to the
    # bare default layout (FORMATS.get returned None).
    FORMATS = {
        logging.DEBUG: base_format.format(level_color=grey),
        logging.INFO: base_format.format(level_color=green),
        logging.WARNING: base_format.format(level_color=yellow),
        logging.ERROR: base_format.format(level_color=red),
        logging.CRITICAL: base_format.format(level_color=bold_red),
    }

    def format(self, record):
        """Format *record* with the level-specific colored layout."""
        log_fmt = self.FORMATS.get(record.levelno)
        formatter = logging.Formatter(log_fmt)
        return formatter.format(record)
46
+
47
+
48
def custom_logger(app_name="APP"):
    """Create (or fetch) a named logger with colored console output.

    Fix: the original attached a new StreamHandler on every call, so calling
    it twice with the same name duplicated every log line; handlers are now
    added only once per logger.
    """
    logger_r = logging.getLogger(name=app_name)
    tz = pytz.timezone("Asia/Ho_Chi_Minh")

    # Render timestamps in Ho Chi Minh local time regardless of host TZ.
    logging.Formatter.converter = lambda *args: datetime.now(tz).timetuple()

    if not logger_r.handlers:
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)
        ch.setFormatter(CustomFormatter())
        logger_r.addHandler(ch)

    logger_r.setLevel(logging.INFO)
    return logger_r


# Module-level logger shared across the project.
logger = custom_logger(app_name=CoreCFG.PROJECT_NAME)
prompt.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from langchain_core.prompts import ChatPromptTemplate
3
+ from typing import Literal
4
+ from config.llm import llm_1, llm_2
5
+ from typing import Optional
6
+
7
+
8
# Structured output for the router; docstring and description are kept exactly
# as-is because pydantic feeds them into the schema sent to the LLM.
class RouteQuery(BaseModel):
    """Route a user query to the most relevant datasource."""

    datasource: Literal["vectorstore", "casual_convo"] = Field(
        ...,
        description="Given a user question choose to route it to casual_convo or a vectorstore.",
    )
15
+
16
+
17
class ExtractFilter(BaseModel):
    """Extract job level and job title from user question."""

    # Fix: the extraction prompt says to "leave the field blank" when the info
    # is absent, but both fields were required; default to None so a missing
    # value cannot fail structured-output validation. Callers already test
    # these with plain truthiness, so None behaves like "".
    job_level: Optional[str] = Field(
        None, description="The level of the job the user is asking about."
    )
    job_title: Optional[str] = Field(
        None, description="The title of the job the user is asking about."
    )
22
+
23
+
24
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    # Tightened from plain `str` to the only two values the graph ever checks
    # for (`== "yes"`), matching GradeHallucinations and constraining the
    # model's structured output.
    binary_score: Literal["yes", "no"] = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )
30
+
31
+
32
class GenerateAnswer(BaseModel):
    """Generate an answer based on the provided documents."""

    answer: str = Field(description="Generated answer based on the provided documents.")
    # Fix: Optional[...] alone does not make a pydantic field optional — with
    # no default it was still required. None means "no document selected",
    # which generate_answer_rag_fn already checks for explicitly.
    selected_document_index: Optional[int] = Field(
        None, description="Index of the selected document."
    )
39
+
40
+
41
# Structured output for the grounding check; docstring and description are
# preserved byte-for-byte since pydantic sends them to the LLM as schema text.
class GradeHallucinations(BaseModel):
    """Binary score for grounding of generation answer in provided facts."""

    binary_score: Literal["yes", "no"] = Field(
        description="Whether the answer is grounded in the provided facts. 'yes' if the answer is supported by facts, 'no' if the answer contains information not present or contradicting the given facts"
    )
47
+
48
+
49
# Router prompt: vectorstore for recruitment questions, casual_convo otherwise.
route_prompt = ChatPromptTemplate(
    [
        (
            "system",
            # Fix: the second example said "related to CS324" — a leftover
            # from another project; it now names recruitment.
            """You are an expert at routing the user's question to vectorstore or casual_convo.
choose vectorstore if the question is related to job recruitment and casual_convo otherwise. \n
vectorstore contains documents related to recruitment of human resources in all industries! Related to salary, recruitment position, job description for each job. Use vectorstore for questions about these topics that require some data and follow-up questions. Otherwise, if only need normal feedback and chat history, use casual_convo.

example:
user: Hi are you [this is a random question not related to recruitment so route to casual_convo] : casual_convo
user: Jobs for junior dev? [this question is related to job recruitment so route to vectorstore] : vectorstore""",
        ),
        ("human", "{question}"),
    ]
)
64
# Query re-writer: condenses the question + history into a retrieval-friendly
# standalone query.
re_write_query_prompt = ChatPromptTemplate(
    [
        (
            "system",
            # Fix: "You a question re-writer" grammar, and the final bracketed
            # note was left unclosed mid-sentence.
            """You are a question re-writer that converts an input question to a better version that is optimized
for vectorstore retrieval, and very concise. Look at the input and try to reason about the underlying semantic intent/meaning. The input can also be a
follow up question, look at the chat history to re-write the question to include necessary info from the chat history to a better version that is optimized
for vectorstore retrieval without any other info needed. [The topic of the conversation will generally be around recruitment. Re-write the query based on the history and include keywords related to this topic.]""",
        ),
        ("placeholder", "{history}"),
        (
            "human",
            "{question}",
        ),
    ]
)
80
+
81
# Pulls job_title / job_level metadata out of the question so retrieval can be
# filtered; fields are left blank when the question doesn't specify them.
extract_filter_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an expert at extracting metadata from the user's question about recruitment and using it to filter the retrieved documents.

Fields to extract:
- Job level: The level of the job the user is asking about. Possible values are:
+ intern
+ fresher
+ junior
+ middle
+ senior
+ expert
- Job title: The title of the job the user is asking about.
+ developer
+ engineer
+ designer
+ business
+ translator
+ other (if the job title is not in the list above)

Note:
Leave the field blank if the information is not present in the user's question.
If the user's question contains multiple job titles, choose the most relevant one.
If the user's question contains multiple job levels, choose the most relevant one.
If not sure job-title or job-level, leave it blank.
""",
        ),
        ("placeholder", "{history}"),
        ("user", "{question}"),
    ]
)
114
+
115
# Relevance grader for retrieved documents.
check_relevant_document_prompt = ChatPromptTemplate(
    [
        (
            "system",
            # Fix: the original also asked for a 0-1 score, but the structured
            # output schema (GradeDocuments) only carries binary_score, so that
            # instruction could never be honored and was removed.
            """
You are a grader assessing relevance of a retrieved document to a user question.
If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.
It does not need to be a stringent test. The goal is to filter out erroneous retrievals.
Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.
""",
        ),
        (
            "human",
            "Retrieved document: \n\n {document} \nvs\n User question: {question}",
        ),
    ]
)
133
+
134
# RAG answer prompt: answers from the retrieved job-description context.
gen_answer_rag_prompt = ChatPromptTemplate(
    [
        (
            "system",
            # Fix: typos "Your are" -> "You are" and "just sau" -> "just say".
            # The Vietnamese fallback string is left exactly as deployed.
            """You are chat bot related to recruitment. You are asked to generate an answer based on the provided documents.
You are given context related to job description of a job position. If the context not provided, you just say 'kh么ng c贸 t脿i li峄噓 li锚n quan'
Answer in {language} language.

Context:
```
{context}
```

""",
        ),
        (
            "human",
            """
Question: {question}
""",
        ),
    ]
)
157
+
158
+
159
# Grades whether the generated answer resolves the question; lenient toward
# explicit "I don't know" style responses. NOTE(review): the leniency
# instruction is duplicated in the human turn — presumably intentional
# emphasis; confirm.
grade_answer_prompt = ChatPromptTemplate(
    [
        (
            "system",
            """You are a grader assessing whether an answer addresses / resolves a question \n
Give a binary score 'yes' or 'no'. Yes' means that the answer resolves the question.
If the LLM Generation is saying that it doesnt know or not sure or stating to keep the questions relevant to topic , grade it as 'yes'.""",
        ),
        (
            "human",
            "If the LLM Generation is saying that it doesnt know or not sure or stating to keep the questions relevant to topic , grade it as 'yes'. User question: \n\n {question} \n\n LLM generation: {generation}",
        ),
    ]
)
173
# Casual-conversation prompt: stays on recruitment topics without quoting jobs.
gen_normal_answer_prompt = ChatPromptTemplate(
    [
        (
            "system",
            # Fix: "assitant related recruitment" typo/grammar, and the final
            # sentence was truncated mid-word ("...to prov").
            """You are an assistant related to recruitment. If user ask about seeking job, you can ask more detail about job title, job level, etc. If user ask about another topic, you can say that you don't know. Not answer a job information.
Only answer related to hot trend in recruitment domain. Not give user a job hiring information. Can use history like a context to provide a better answer.
""",
        ),
        ("placeholder", "{history}"),
        ("human", "{question}"),
    ]
)
185
# NOTE(review): defined but not wired into any chain in this module — confirm
# whether it is still needed.
gen_answer_history_prompt = ChatPromptTemplate(
    [
        (
            "system",
            "You are assitant related recruitment. Using your knowledge about recruitment domain. If you not sure about the answer, just say that you don't know.",
        ),
        ("placeholder", "{history}"),
        ("user", "{question}"),
    ]
)
195
# Composed runnables: each pairs a prompt with an LLM, using structured output
# where the graph expects a typed response.
route_chain = route_prompt | llm_1.with_structured_output(RouteQuery)
transform_query_chain = re_write_query_prompt | llm_1
extract_filter_chain = extract_filter_prompt | llm_1.with_structured_output(ExtractFilter)
grade_documents_chain = check_relevant_document_prompt | llm_2.with_structured_output(GradeDocuments)
gen_answer_rag_chain = gen_answer_rag_prompt | llm_2.with_structured_output(GenerateAnswer)
gen_normal_answer_chain = gen_normal_answer_prompt | llm_1
grade_hallucinations_chain = grade_answer_prompt | llm_1.with_structured_output(GradeHallucinations)
requirments.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Web framework + ASGI server (imported by app.py; uvicorn is the Docker CMD).
fastapi
uvicorn
# graph_function.py loads .env at startup.
python-dotenv
# logger.py timezone handling.
pytz
langchain-pinecone
langgraph
langchain-google-genai