# NOTE(review): the lines below were scraped from a Hugging Face Space page
# ("Spaces: Sleeping" status header) — kept here as a comment so the file parses.
# Spaces: Sleeping / Sleeping
print("stage z")
#from multiprocessing import Process, freeze_support
import datetime
from io import StringIO
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.readers.file import CSVReader
#from llama_index.embeddings.huggingface import HuggingFaceEmbedding
#from llama_index.llms.ollama import Ollama
from llama_index.core.response.notebook_utils import display_source_node
#from llama_index.readers.file import ImageReader
import tiktoken
#from openai import OpenAI
from llama_index.llms.openai import OpenAI
print("stage f")
import gradio as gr
import sys
import csv

# The SFw dataset has very long text cells; raise the csv module's default
# 128 KB field-size limit so reading them does not raise csv.Error.
csv.field_size_limit(sys.maxsize)

import nltk
import os

# NOTE(review): abspath("__file__") resolves the *literal string* "__file__",
# so `root` is really the current working directory, not this file's
# directory.  The app therefore only works when launched from the project
# root (where ./nltk_data and ./data live) — confirm before changing.
root = os.path.dirname(os.path.abspath("__file__"))
print(root)

# Point NLTK at the project-local data directory (./nltk_data).
download_dir = os.path.join(root, 'nltk_data')
print(download_dir)
# NOTE(review): chdir into the data dir and back out again is fragile;
# nltk.data.path.append() alone is usually sufficient — confirm intent.
os.chdir(download_dir)
nltk.data.path.append(download_dir)
os.chdir("..")
#os.chdir(root)

# Silence spaCy's W008 warning ("evaluating similarity on empty vectors"),
# which the semantic chunker triggers frequently.
os.environ["SPACY_WARNING_IGNORE"] = "W008"
import warnings
warnings.filterwarnings("ignore", message=r"\[W008\]", category=UserWarning)

from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
    SemanticDoubleMergingSplitterNodeParser,
    LanguageConfig,
)

# def get_meta(file_path):
#     return {"foo": "bar", "file_path": file_path}

print("CP import")

# Module-level handles; both are (re)assigned inside startup().
demo = ""
query_engine = ""

# Header/system prompt prepended to every user query sent to the LLM.
role_research = """Refer to the files provided to answer the task below.
Your task is to try your best find a job, skill, industry or competency described in the files that is vaguely similar to the description below.
If the description below is about a skill or competency, then your response should show similar skill and competencies from the file.
If you cannot find any matching content in the files, pick the most similar one and suggest how it might relate the description below.
You should end your response with these exact words "(Note: This is just a high-level summary of 1 or 2 aspects. Please refer to the Results below to view more. Click Download to save this search.)"
Description:
"""
def startup():
    """Build the vector index over ./data and construct the Gradio UI.

    Populates module globals:
      - vector_index2: VectorStoreIndex over semantically chunked CSV rows,
        queried later by estl_ux_app().
      - nodes2: the parsed nodes (kept for inspection/debugging).
      - demo: the gr.Interface that __main__ serves via demo.launch().

    Performs network/model I/O (OpenAI settings, spaCy model load, embedding)
    and can take a long time on large datasets.
    """
    global vector_index2

    # Global LLM configuration: deterministic gpt-4o-mini with a large
    # generation budget.
    Settings.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=16383)
    Settings.tokenizer = tiktoken.encoding_for_model("gpt-4o-mini").encode
    print("defined global LLM")

    # Maximum input size to the LLM.
    Settings.context_window = 128000
    print("maximum input size to the LLM")

    # Number of tokens reserved for text generation.
    Settings.num_output = 16000
    print("number of tokens reserved for text generation.")

    print("loading data")
    # Load every CSV under ./data; concat_rows=False yields one Document
    # per row so each SFw record becomes its own retrievable unit.
    parser = CSVReader(concat_rows=False)
    file_extractor = {".csv": parser}  # Add other CSV formats as needed
    documents = SimpleDirectoryReader(
        "./data", recursive=True, file_extractor=file_extractor
    ).load_data(num_workers=4)
    print("load data done")

    print("semantic doublemerging...")
    config = LanguageConfig(language="english", spacy_model="en_core_web_md")
    print("parser...")
    # Two-pass semantic chunking thresholds:
    #  - initial_threshold: similarity needed for sentences to start a chunk;
    #    higher => more focused but smaller chunks.
    #  - appending_threshold: similarity needed to append a sentence to an
    #    existing chunk; higher => more cohesive chunks.
    #  - merging_threshold: similarity needed to merge chunks in pass two;
    #    higher => consolidates related chunks, risks merging unrelated ones.
    # Tune per text: monothematic text wants higher appending/merging values.
    # See https://bitpeak.pl/chunking-methods-in-rag-methods-comparison/
    splitter_double = SemanticDoubleMergingSplitterNodeParser(
        language_config=config,
        initial_threshold=0.3,
        appending_threshold=0.4,
        merging_threshold=0.7,
        #max_chunk_size=5000,
    )
    print("parser...done")

    print("getting nodes from docs")
    global nodes2
    nodes2 = splitter_double.get_nodes_from_documents(documents)
    print("semantic doublemerging... done")

    print("vectoring...")
    vector_index2 = VectorStoreIndex(nodes2, show_progress=True)
    print("vectoring...done")
    # index = VectorStoreIndex.from_documents(documents)  # to get all nodes

    # Default example text pre-filled in the main prompt box.
    grat = """An astronaut.
He is a person trained, equipped, and deployed by a human spaceflight program to serve as a commander or crew member aboard a spacecraft.
Although generally reserved for professional space travelers, the term is sometimes applied to anyone who travels into space, including scientists, politicians, journalists, and tourists."""

    global demo
    demo = gr.Interface(
        estl_ux_app,
        # Inputs: (main_prompt, similarity_top_k) — must match estl_ux_app's
        # parameter order.
        [
            gr.Textbox(
                label="Main Prompt",
                info="Describe an industry, job role, skills and/or competencies",
                lines=10,
                value=grat,
            ),
            gr.Textbox(
                label="Number of items to find",
                info="How many items?",
                lines=1,
                value="5",
            ),
        ],
        # Outputs: (LLM summary, matching chunks, downloadable CSV) — must
        # match estl_ux_app's 3-tuple return.
        [
            gr.Textbox(
                label="AI response",
                info="Final output from LLM",
                max_lines=20,
                value="""
Please click on submit first.
""",
            ),
            gr.Textbox(
                label="Results",
                info="Items which match",
                lines=30,
                autoscroll=False,
                value="(Click on submit first)",
            ),
            # NOTE(review): the button points at your_sfw.csv before the first
            # search has created it — confirm the file exists at startup.
            gr.DownloadButton("Download 'your_sfw.csv' file (Click on submit first)", visible=True, value="your_sfw.csv"),
        ],
        examples=[
            ["Soldier", 10],
            ["Firefighter", 10],
            ["Astronaut", 10],
            ["President of singapore", 5],
            ["Clown", 5],
            ["Religious worker", 10],
            ["Pastor", 10],
        ],
        title="SMAIT (Skills Map AI Tool) [Demo version only]",
        description="""Don't want to read through 48,000+ rows of SFw Dataset?
Use this app to find possible matches based on your description of the industry, job role, skills and/or competencies.
Need search ideas? Scroll down and click on the examples.
Note: Click DOWNLOAD (bottom of page) to save your search!""",
    )


print("CP startup")
def estl_ux_app(main_prompt, similarity_top_k):
    """Run one search: query the index, format matches, write your_sfw.csv.

    Parameters:
      main_prompt: free-text description of an industry/job/skill/competency.
      similarity_top_k: number of nearest chunks to retrieve (str from the
        UI textbox or int from the examples table; coerced with int()).

    Returns a 3-tuple for the Gradio outputs:
      (LLM response, human-readable matches text, "your_sfw.csv" path).

    Side effect: overwrites ./your_sfw.csv with one row per retrieved chunk.
    """
    header_prompt = role_research
    response_mode_type = "tree_summarize"  # alternatives: compact_accumulate, refine, ...
    # Build a fresh query engine each call so similarity_top_k reflects the
    # value the user just entered.
    query_engine = vector_index2.as_query_engine(
        response_mode=response_mode_type,
        similarity_top_k=int(similarity_top_k),
    )
    response2 = query_engine.query(header_prompt + main_prompt)

    chosen_nodes = ""
    with open("your_sfw.csv", 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        # Column headers mirror the SFw dataset; "Proficiency Level" appears
        # twice because the source data carries it twice — presumably
        # intentional, verify against the dataset schema.
        writer.writerow(["ID", "Sector", "TCS_CCS Sector", "Track", "Job Role",
                         "Job Role Description", "Critical Work Function", "Key Tasks",
                         "Knowledge or Ability Items", "Range of Application", "CCS_TSC Title or TSC_CCS",
                         "TSC_CCS_Type", "Proficiency Level", "TSC_CCS Code", "TSC_CCS Description",
                         "TSC_CCS Category", "Proficiency Level", "Proficiency Description", "Knowledge or Ability Classification"])
        counter = 0
        for n in response2.source_nodes:
            counter += 1
            ntext = n.text
            # Skip chunks that are just the source CSV's header row
            # (their text begins with "|ID").
            if ntext.startswith("|ID"):
                print("skipped")
                continue
            chosen_nodes += "CHOICE " + str(counter) + " ###################################### \n"
            chosen_nodes += "\n - Similarity: " + str(round(n.score * 100)) + "%"
            # Chunk fields are delimited by "|, |"; collapse the resulting
            # blank-line runs for readability.
            chosen_nodes += "\n - Content: " + (ntext.replace("|, |", "\n\n")).replace("\n\n\n\n\n\n\n\n", "\n") + "\n\n\n\n"
            # One CSV row per chunk, split on the same field delimiter.
            writer.writerow(ntext.split("|, |"))
    print("cp aclp_ux_app")
    return response2, chosen_nodes, "your_sfw.csv"
def update_message(request: gr.Request):
    """Return a greeting for the authenticated user of this session."""
    name = request.username
    return f"Welcome, {name}"
print("launching")

if __name__ == "__main__":
    # Earlier multiprocessing variants (freeze_support / Process(target=startup))
    # were abandoned; build the index and UI synchronously, then serve.
    startup()
    #demo.launch(auth=[("user", "nathan")])
    demo.launch()
    # NOTE(review): gr launch() normally blocks, so this line is presumably
    # reached only after the server stops — confirm if it matters.
    print("launched")