"""Resume-parsing pipeline.

Extracts text from resume files (PDF / image / zip of files) via OCR,
cleans the OCR output with an LLM, parses it into a structured record,
and appends unique records to a local CSV (``./resumes.csv``).  A simple
skill/experience filter over that CSV is provided for candidate search.
"""

# --- standard library ---
import json
import os
import shutil
import tempfile
import zipfile
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

# --- third-party ---
import chromadb
import cv2
import fitz  # PyMuPDF
import gradio as gr
import pandas as pd
import pytesseract
import regex as re
from PIL import Image
from openai import OpenAI

from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_core.documents import Document
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_deepseek import ChatDeepSeek
from langchain_ollama import ChatOllama, OllamaEmbeddings, OllamaLLM

# Prompt used to turn raw (OCR-corrected) resume text into the structured
# record described by `response_schemas` below.
review_template = """
You are extracting structured information from the given text.
ONLY use the information explicitly available in the text provided.
If a specific field cannot be extracted from the input text, respond with 'null'.

Input text: {text}

Provide outputs in the following format:
{format_instructions}
"""

# Response schemas: one per field the LLM must extract from a resume.
name_schema = ResponseSchema(
    name="Name",
    description="Name of the person in resume text applying for job? Answer noun as string or unknown.",
)
job_role_schema = ResponseSchema(
    name="Job_role",
    description="What is the job role the person is applying for?",
)
skills_schema = ResponseSchema(
    name="Skills",
    description="All the skill in resume text and output them as a comma separated Python list.",
    type="list",
)
exp_schema = ResponseSchema(
    name="Experience",
    description="How much experience in years he has in resume text which is a number",
    type="integer",
)
info_schema = ResponseSchema(
    name="Personal Information",
    description=(
        "Extract the information of the person like Phone number, Address, City, "
        "Gender, Gmail and extract and save it in dictionary as key and values."
    ),
    type="dictionary",
)
prof_schema = ResponseSchema(
    name="Profile",
    description="What is his profile he is mentioned in text for the job application and summarize it.",
    type="string",
)
linkedin_schema = ResponseSchema(
    name="Linkedin",
    description="Linkedin link if available else unknown",
    type="string",
)

# Destination CSV for parsed resume records.
csv_path = "./resumes.csv"

# SECURITY: API keys are hard-coded here (and below in DEEPSEEK_API_KEY).
# They are preserved to keep runtime behavior identical, but they should be
# rotated and loaded from an environment variable / secrets manager instead
# of being committed to source control.
client = OpenAI(
    api_key="sk-02f34bd0ea4849e8a4232bc656e28727",  # TODO: move to env var
    base_url="https://api.deepseek.com/v1",
)

os.environ["DEEPSEEK_API_KEY"] = "sk-02f34bd0ea4849e8a4232bc656e28727"  # TODO: move to env var

# LLM used for the structured-extraction step (via LangChain).
llm = ChatDeepSeek(
    model="deepseek-chat",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

chat_prompt = ChatPromptTemplate.from_template(review_template)

# All fields we expect the LLM to return, and the parser/format-instructions
# derived from them.
response_schemas = [
    name_schema,
    job_role_schema,
    skills_schema,
    exp_schema,
    info_schema,
    prof_schema,
    linkedin_schema,
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()


def img_extract(img_file):
    """OCR a single image file and return the extracted text.

    Raises:
        ValueError: if the image cannot be read (cv2.imread silently
            returns None for missing/corrupt files, which would otherwise
            surface as an opaque pytesseract error).
    """
    img = cv2.imread(img_file)
    if img is None:
        raise ValueError(f"Could not read image file: {img_file}")
    return pytesseract.image_to_string(img)


def pdf_pages_image(page, matrix_scale=3):
    """Render one PDF page to a PIL RGB image at `matrix_scale`x zoom."""
    matrix = fitz.Matrix(matrix_scale, matrix_scale)
    pixmap = page.get_pixmap(matrix=matrix)
    return Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)


def pdf_extract(pdf_file):
    """OCR every page of a PDF and return the concatenated text.

    Pages are rasterized in parallel, then OCR'd sequentially.
    NOTE(review): PyMuPDF documents are not documented as thread-safe;
    sharing `pdf_doc` across the pool workers should be confirmed.
    """
    pdf_doc = fitz.open(pdf_file)
    num_pages = pdf_doc.page_count
    print(num_pages)
    with ThreadPoolExecutor(max_workers=10) as executor:
        images = list(
            executor.map(lambda i: pdf_pages_image(pdf_doc[i], 5), range(num_pages))
        )
    print(len(images))
    # ''.join avoids quadratic string concatenation over many pages.
    return "".join(pytesseract.image_to_string(image) for image in images)


def zip_extract(file):
    """Diagnostic helper: list the entries of a zip archive.

    BUGFIX: the original referenced an undefined name `zip_path` instead of
    the `file` parameter, so every call raised NameError.  This helper only
    prints entry names; actual zip processing lives in `resume_parser`.
    """
    with zipfile.ZipFile(file, "r") as zip_ref:
        for file_info in zip_ref.infolist():
            print(f"File name: {file_info.filename}")
            file_path = Path(file_info.filename)
            print(file_path)


def parsing(text):
    """Run the structured-extraction prompt over `text`.

    Returns:
        dict: the LLM's JSON answer parsed into a Python dict.
    """
    chat_prompt_message = chat_prompt.format_messages(
        text=text, format_instructions=format_instructions
    )
    response = get_completion(chat_prompt_message)
    print(response)
    # JsonOutputParser tolerates the markdown fencing the model sometimes
    # wraps around its JSON answer.
    json_parser = JsonOutputParser()
    output_dict = json_parser.invoke(response)
    print(output_dict)
    print(type(output_dict))
    return output_dict


def file_extract(file, extension):
    """Dispatch OCR extraction by file extension; unknown types yield ''."""
    text = ""
    if extension.lower() in (".png", ".jpg", ".jpeg"):
        text = img_extract(file)
    elif extension.lower() == ".pdf":
        print("pdf")
        text = pdf_extract(file)
    print(text)
    return text


def get_completion(prompt):
    """Invoke the LangChain LLM and return the raw text content."""
    ai_msg = llm.invoke(prompt)
    return ai_msg.content


def correct_OCR(text):
    """Ask DeepSeek to fix OCR spelling/layout errors; fall back to input.

    Best-effort by design: on any API failure the original text is returned
    unchanged so the pipeline can continue.
    """
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that checks and corrects spelling mistakes in defective OCR text. Understanding layout of text and please reorganize the text into respective headings to respective text",
                },
                {"role": "user", "content": f"Content:\n{text}"},
            ],
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error while correcting OCR: {e}")
        return text


def processing(filepath):
    """Full pipeline for one file: OCR -> correct -> parse -> append to CSV.

    Duplicate candidates (same `Name` already in the CSV) are skipped.

    Returns:
        str: pretty-printed JSON of the parsed record (for display only).
    """
    filename = os.path.basename(filepath)
    extension = os.path.splitext(filepath)[1]
    print(extension)

    extracted_text = file_extract(filepath, extension)
    corrected_text = correct_OCR(extracted_text)
    parsed_dict = parsing(corrected_text)

    json_output = json.dumps(parsed_dict, indent=2)  # display only

    # Flatten nested keys ("Personal Information.Phone", ...) into columns.
    df = pd.json_normalize(parsed_dict)

    # Store list-valued skills as a comma-separated string for CSV.
    # Guarded: the LLM may omit the Skills field entirely.
    if "Skills" in df.columns:
        df["Skills"] = df["Skills"].apply(
            lambda x: ", ".join(x) if isinstance(x, list) else x
        )

    df["filename"] = filename

    # Fixed column order keeps appended CSV rows aligned with the header.
    desired_columns = [
        "Name",
        "Job_role",
        "Experience",
        "Skills",
        "Profile",
        "Linkedin",
        "Personal Information.Phone",
        "Personal Information.Gmail",
        "Personal Information.Address",
        "Personal Information.City",
        "filename",
    ]
    for col in desired_columns:
        if col not in df.columns:
            df[col] = None
    df = df[desired_columns]

    # Append to CSV only if Name is unique.
    if os.path.exists(csv_path):
        existing_df = pd.read_csv(csv_path)
        if df["Name"].iloc[0] in existing_df["Name"].values:
            print(f"Duplicate entry found for Name: {df['Name'].iloc[0]}. Skipping...")
            return json_output
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, mode="w", index=False, header=True)
    return json_output


def resume_parser(filepath):
    """Gradio entry point: process a single resume file or a zip of them.

    BUGFIX: the original compared the extension against 'docx' (no dot),
    but os.path.splitext returns '.docx', so Word files never matched.
    Comparisons are now also case-insensitive ('.PDF' etc.).
    """
    print(filepath)
    if not filepath:
        raise gr.Error("No file selected")

    ext = os.path.splitext(filepath)[1].lower()
    print(ext)

    if ext in (".pdf", ".docx"):
        print(filepath)
        json_output = processing(filepath)
        gr.Info("Data moved to database")
        return json_output

    if ext == ".zip":
        # Per-entry status report: filename -> 'processed' / 'not processed'.
        zip_files = {}
        # TemporaryDirectory auto-cleans the extracted files on exit.
        with tempfile.TemporaryDirectory() as temp_dir:
            print(f"Extracting to temp dir: {temp_dir}")
            with zipfile.ZipFile(filepath, "r") as zip_ref:
                zip_ref.extractall(temp_dir)
                for file_info in zip_ref.infolist():
                    if file_info.is_dir():
                        continue
                    try:
                        extracted_file_path = os.path.join(temp_dir, file_info.filename)
                        print(f"Processing: {extracted_file_path}")
                        processing(extracted_file_path)
                        gr.Info(f"{file_info.filename} moved to database")
                        zip_files[file_info.filename] = "processed"
                    except Exception as err:
                        # Best-effort: one bad file must not abort the batch.
                        print(str(err))
                        gr.Warning(f"{file_info.filename} not processed")
                        zip_files[file_info.filename] = "not processed"
        return json.dumps(zip_files)

    raise gr.Error("No file selected")


def preprocess_skills(skill_text):
    """Normalize a skills value (list or delimited string) to a lowercase list."""
    if isinstance(skill_text, list):
        return [s.strip().lower() for s in skill_text]
    # Split on comma, slash, pipe, ampersand, hyphen, or whitespace.
    return [
        s.strip().lower()
        for s in re.split(r"[,/|&\-\s]+", str(skill_text))
        if s.strip()
    ]


def get_filtered_rows(exp, skills_description):
    """Filter stored resumes by minimum experience and/or skill overlap.

    A candidate matches when at least 40% of the requested skills appear in
    their Skills column; results are sorted by match ratio, then experience.
    Errors are returned as a one-row DataFrame rather than raised, so the
    UI always has something to render.
    """
    try:
        exp_filter = int(exp) if exp else None
        user_skills = preprocess_skills(skills_description) if skills_description else []

        df = pd.read_csv(csv_path)

        if not user_skills and exp_filter is None:
            return pd.DataFrame(
                [{"Message": "Please enter Experience and/or Skills to filter."}]
            )

        if user_skills:

            def skill_match_ratio(candidate_skills):
                # Fraction of the requested skills the candidate possesses.
                candidate_list = preprocess_skills(candidate_skills)
                matches = len(set(user_skills) & set(candidate_list))
                return matches / len(user_skills) if user_skills else 0

            df["match_ratio"] = df["Skills"].apply(skill_match_ratio)
            df = df[df["match_ratio"] >= 0.4]  # prioritize skills match

        # Experience filter keeps rows with unknown (NaN) experience.
        if exp_filter is not None:
            df = df[(df["Experience"].isna()) | (df["Experience"] >= exp_filter)]

        if df.empty:
            return pd.DataFrame([{"Message": "No matching candidates found."}])

        df = df.sort_values(
            by=["match_ratio" if "match_ratio" in df.columns else "Experience", "Experience"],
            ascending=[False, False],
        )
        return df.drop(columns=["match_ratio"], errors="ignore")
    except Exception as e:
        return pd.DataFrame([{"Error": str(e)}])