# aclp-m1p / app.py
# Hugging Face Space file header (Nathanation's picture — "Update app.py",
# commit ea3be04, verified) — kept as a comment so the module parses.
print("stage z")
#from multiprocessing import Process, freeze_support
import datetime
from io import StringIO
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.readers.file import CSVReader
#from llama_index.embeddings.huggingface import HuggingFaceEmbedding
#from llama_index.llms.ollama import Ollama
from llama_index.core.response.notebook_utils import display_source_node
#from llama_index.readers.file import ImageReader
import tiktoken
#from openai import OpenAI
from llama_index.llms.openai import OpenAI
print("stage f")
import gradio as gr
import sys
import csv
# Allow very large CSV fields; the SFw dataset rows contain long text cells
# that exceed the csv module's default 128 KiB field limit.
csv.field_size_limit(sys.maxsize)
import nltk
import os
#root = os.path.dirname(os.path.abspath("__file__"))
# NOTE(review): "__file__" is a quoted string literal, so abspath() resolves it
# against the current working directory — `root` is effectively the CWD, not
# necessarily this script's directory. Works on HF Spaces where CWD is the app
# dir; confirm before running from elsewhere.
root = os.path.dirname(os.path.abspath("__file__"))
print(root)
# Point NLTK at a bundled ./nltk_data directory instead of downloading at runtime.
download_dir = os.path.join(root, 'nltk_data')
print(download_dir)
# NOTE(review): chdir into nltk_data then ".." leaves the CWD at nltk_data's
# parent; this only restores the original CWD if the process started in `root`.
os.chdir(download_dir)
nltk.data.path.append(download_dir)
os.chdir("..")
#os.chdir(root)
# Silence spaCy's W008 "no word vectors" warning raised during semantic chunking.
os.environ["SPACY_WARNING_IGNORE"] = "W008"
import warnings
warnings.filterwarnings("ignore", message=r"\[W008\]", category=UserWarning)
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
    SemanticDoubleMergingSplitterNodeParser,
    LanguageConfig,
)
# def get_meta(file_path):
#     return {"foo": "bar", "file_path": file_path}
print("CP import")
# Module-level placeholders; startup() replaces `demo` with the Gradio app.
demo = ""
query_engine = ""
# System prompt prepended to every user query in estl_ux_app().
role_research = """Refer to the files provided to answer the task below.
Your task is to try your best find a job, skill, industry or competency described in the files that is vaguely similar to the description below.
If the description below is about a skill or competency, then your response should show similar skill and competencies from the file.
If you cannot find any matching content in the files, pick the most similar one and suggest how it might relate the description below.
You should end your response with these exact words "(Note: This is just a high-level summary of 1 or 2 aspects. Please refer to the Results below to view more. Click Download to save this search.)"
Description:
"""
def startup():
    """Build the vector index over ./data and construct the Gradio UI.

    Side effects: configures the llama_index global ``Settings`` (LLM,
    tokenizer, context window, output budget) and sets the module globals
    ``nodes2``, ``vector_index2`` and ``demo``. Takes no arguments and
    returns nothing; ``demo.launch()`` is called later by the main guard.
    """
    global vector_index2
    # # defined global LLM
    # gpt-4o-mini with temperature 0 for (near-)deterministic summaries.
    Settings.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=16383)
    Settings.tokenizer = tiktoken.encoding_for_model("gpt-4o-mini").encode
    print("defined global LLM")
    # # maximum input size to the LLM
    Settings.context_window = 128000
    print("maximum input size to the LLM")
    # # number of tokens reserved for text generation.
    Settings.num_output = 16000
    print("number of tokens reserved for text generation.")
    #file_extractor = {".jpg": ImageReader(parse_text=True)}
    #reader = SimpleDirectoryReader(input_dir="data", recursive=True) #, file_extractor= file_extractor)
    #documents = reader.load_data(num_workers=4) #num_workers=4
    # all_docs = []
    # for docs in reader.iter_data():
    #     # <do something with the documents per file>
    #     print (docs)
    #     all_docs.extend(docs)
    print("loading data")
    # CSV Reader example
    # concat_rows=False keeps one Document per CSV row so each row can be
    # retrieved independently (and later split back into columns on "|, |").
    parser = CSVReader(concat_rows=False)
    file_extractor = {".csv": parser}  # Add other CSV formats as needed
    documents = SimpleDirectoryReader(
        "./data", recursive=True, file_extractor=file_extractor
    ).load_data(num_workers=4)
    print("load data done")
    #Settings.chunk_size = 512
    #print("starting embed")
    # For a Local setup:
    #Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5") # base_url="http://host.docker.internal:11434"
    #print("embed done")
    #embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
    #Settings.llm = Ollama(model="llama3.2", request_timeout=360.0) #llama3.1:latest
    #print("ollama up")
    print("semantic doublemerging...")
    # Requires the spaCy model en_core_web_md to be installed in the image.
    config = LanguageConfig(language="english", spacy_model="en_core_web_md")
    print("parser...")
    splitter_double = SemanticDoubleMergingSplitterNodeParser(
        language_config=config,
        initial_threshold=0.3,
        appending_threshold=0.4,
        merging_threshold=0.7,
        #max_chunk_size=5000,
    )
    print("parser...done")
    # Parameters
    # Thresholds in the algorithm control the grouping of sentences into chunks (in the first pass) and chunks into larger chunks (in the second pass). Here's a brief overview of the three thresholds:
    # - initial_threshold: Specifies the similarity needed for initial sentences to form a new chunk. A higher value creates more focused chunks but may result in smaller chunks.
    # - appending_threshold: Determines the minimum similarity required for adding sentences to an existing chunk. A higher value promotes cohesive chunks but may result in fewer sentences being added.
    # - merging_threshold: Sets the similarity level for merging chunks. Higher value consolidates related chunks but risks merging unrelated ones.
    # For optimal performance, set the appending_threshold and merging_threshold relatively high to ensure cohesive and relevant chunks, while keeping the initial_threshold slightly lower to capture a broader range of semantic relationships. Adjust these thresholds based on text characteristics and desired chunking outcomes. Additionally, examples should be added: monothematic text should have higher merging_threshold and appending_threshold in order to differentiate chunks, even if the text is highly related, and to avoid classifying the entire text as a single chunk.
    # https://bitpeak.pl/chunking-methods-in-rag-methods-comparison/
    #
    # Remember that different spaCy models and various parameter values can perform differently on specific texts.
    # A text that clearly changes its subject matter should have lower threshold values to easily detect these changes.
    # Conversely, a text with a very uniform subject matter should have high threshold values to help split the text
    # into a greater number of chunks. For more information and comparison with different chunking methods check
    # https://bitpeak.pl/chunking-methods-in-rag-methods-comparison/
    print("getting nodes from docs")
    global nodes2
    nodes2 = splitter_double.get_nodes_from_documents(documents)
    print("semantic doublemerging... done")
    print("vectoring...")
    vector_index2 = VectorStoreIndex(nodes2, show_progress=True)
    print("vectoring...done")
    # index = VectorStoreIndex.from_documents(documents) # to get all nodes
    # Default example text shown in the "Main Prompt" box.
    grat = """An astronaut.
He is a person trained, equipped, and deployed by a human spaceflight program to serve as a commander or crew member aboard a spacecraft.
Although generally reserved for professional space travelers, the term is sometimes applied to anyone who travels into space, including scientists, politicians, journalists, and tourists."""
    global demo
    # Two inputs (prompt, top-k) -> three outputs (LLM answer, raw matches,
    # download button), wired to estl_ux_app().
    demo = gr.Interface(
        estl_ux_app,
        [
            # gr.Textbox(
            #     label="Header Prompt",
            #     info="What's AI's role and goals?",
            #     lines=5,
            #     value=role_research,
            # ),
            gr.Textbox(
                label="Main Prompt",
                info="Describe an industry, job role, skills and/or competencies",
                lines=10,
                value=grat),
            # gr.Radio(["compact_accumulate", "refine", "tree_summarize", "compact", "simple_summarize", "no_text", "accumulate"],
            #     label="Response Mode",
            #     value="tree_summarize",
            #     info="""
            #     Choose the response mode.
            #     More https://docs.llamaindex.ai/en/stable/module_guides/deploying/query_engine/response_modes/"""),
            gr.Textbox(
                label="Number of items to find",
                info="How many items?",
                lines=1,
                value="5",
            )
        ],
        [
            gr.Textbox(
                label="AI response",
                info="Final output from LLM",
                max_lines=20,
                value=
                """
Please click on submit first.
""",
            ),
            gr.Textbox(
                label="Results",
                info="Items which match",
                lines=30,
                autoscroll=False,
                value="(Click on submit first)",
            ),
            gr.DownloadButton("Download 'your_sfw.csv' file (Click on submit first)", visible=True, value="your_sfw.csv")
        ],
        examples=[
            ["Soldier", 10],
            ["Firefighter", 10],
            ["Astronaut", 10],
            ["President of singapore", 5],
            ["Clown", 5],
            ["Religious worker", 10],
            ["Pastor", 10]
        ],
        title="SMAIT (Skills Map AI Tool) [Demo version only]",
        description="""Don't want to read through 48,000+ rows of SFw Dataset?
Use this app to find possible matches based on your description of the industry, job role, skills and/or competencies.
Need search ideas? Scroll down and click on the examples.
Note: Click DOWNLOAD (bottom of page) to save your search!"""
    )
# Module-load checkpoint (mirrors "CP import" above).
print("CP startup")
def estl_ux_app(main_prompt, similarity_top_k):
    """Run one search: query the vector index and export the matches to CSV.

    Gradio callback wired up in startup(). Queries the module-global
    ``vector_index2`` with the role prompt + user description, writes the
    retrieved rows to ``your_sfw.csv``, and returns the three UI outputs.

    Parameters
    ----------
    main_prompt : str
        User's description of an industry, job role, skill or competency.
    similarity_top_k : str | int
        Number of nodes to retrieve (comes from a Textbox, so usually a str).

    Returns
    -------
    tuple
        (LLM response, formatted matching chunks, CSV file name).
    """
    header_prompt = role_research
    response_mode_type = "tree_summarize"
    # Retrieve top-k nodes and summarize them with the globally configured LLM.
    query_engine = vector_index2.as_query_engine(
        response_mode=response_mode_type,
        similarity_top_k=int(similarity_top_k),
    )
    response2 = query_engine.query(header_prompt + main_prompt)

    with open("your_sfw.csv", 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        writer.writerow(["ID", "Sector", "TCS_CCS Sector", "Track", "Job Role",
                         "Job Role Description", "Critical Work Function", "Key Tasks",
                         "Knowledge or Ability Items", "Range of Application", "CCS_TSC Title or TSC_CCS",
                         "TSC_CCS_Type", "Proficiency Level", "TSC_CCS Code", "TSC_CCS Description",
                         "TSC_CCS Category", "Proficiency Level", "Proficiency Description", "Knowledge or Ability Classification"])
        counter = 0
        chosen_nodes = ""
        for n in response2.source_nodes:
            ntext = n.text
            # Skip the source CSV's header row if it was retrieved as a chunk.
            if ntext[:3] == "|ID":
                print("skipped")
                continue
            # FIX: increment only for rows actually shown, so CHOICE numbers
            # are consecutive (previously skipped chunks left gaps).
            counter += 1
            # FIX: a retriever may return score=None; treat that as 0 instead
            # of crashing on None * 100.
            score = n.score if n.score is not None else 0.0
            chosen_nodes += "CHOICE " + str(counter) + " ###################################### \n"
            chosen_nodes += "\n - Similarity: " + str(round(score * 100)) + "%"
            chosen_nodes += "\n - Content: " + (ntext.replace("|, |", "\n\n")).replace("\n\n\n\n\n\n\n\n", "\n") + "\n\n\n\n"
            # CSVReader(concat_rows=False) joins a row's cells with "|, |";
            # split them back into columns for the export file.
            writer.writerow(ntext.split("|, |"))
    print("cp aclp_ux_app")
    return response2, chosen_nodes, "your_sfw.csv"
def update_message(request: gr.Request):
    """Greet the logged-in user by the username from Gradio's auth request."""
    username = request.username
    return "Welcome, {}".format(username)
print("launching")
if __name__ == "__main__":
    # freeze_support()
    # Process(target=startup).start()
    # Process(target=startup).start()
    # Build the index and the UI; blocking, and slow on first run (embedding).
    startup()
    # with gr.Blocks() as demo:
    #     m = gr.Markdown()
    #     logout_button = gr.Button("Logout", link="/logout")
    #     demo.load(update_message, None, m)
    #demo.launch(auth=[("user", "nathan")])
    # Serve the Gradio app; blocks until the server is shut down.
    demo.launch()
print("launched")