Spaces:

ner4archives
/

NER4Archives-analytics

Sleeping

App Files Files Community

NER4Archives-analytics / n4a_analytics_lib /st_components.py

lterriel

clean & refactor components + add doc

74e2066 over 3 years ago

raw

history blame contribute delete

7.27 kB

	# -*- coding: utf-8 -*-

	import io

	import pandas
	import streamlit as st
	from pycaprio import Pycaprio, mappings
	from zipfile import ZipFile
	from requests.exceptions import JSONDecodeError

	from n4a_analytics_lib.analytics import (GlobalStatistics,
	IaaStatistics)
	from n4a_analytics_lib.constants import KAPPA_LEGEND


	@st.cache
	def convert_df(df_ex: pandas.DataFrame) -> bytes:
	    """Serialize a dataframe to CSV and return the result as UTF-8 bytes.

	    :param df_ex: dataframe to export
	    :return: CSV text produced by ``DataFrame.to_csv``, encoded in UTF-8
	    """
	    csv_text = df_ex.to_csv(encoding="utf-8")
	    return csv_text.encode('utf-8')


	def check_login(username: str, password: str) -> bool:
	    """Tell whether both credential fields have been filled in.

	    Only emptiness is checked; the credentials are not validated here.

	    :param username: user name typed by the user
	    :param password: password typed by the user
	    :return: ``True`` when both values are non-empty, ``False`` otherwise
	    """
	    return len(username) != 0 and len(password) != 0


	def display_data(col: st.columns) -> None:
	    """Render the global statistics stored in the session into a column.

	    Reads the object stored under ``st.session_state['gs_obj']`` and shows,
	    in order: the total number of annotations, its ``df_i`` dataframe, a
	    select box over its ``documents`` and the bar plot returned by its
	    ``create_plot`` method for the selected entry.

	    :param col: Streamlit container the widgets are written to
	    """
	    stats = st.session_state['gs_obj']

	    total_label = f"{stats.total_annotations_project} Named entities"
	    col.metric("Total curated annotations", total_label)
	    col.dataframe(stats.df_i)

	    chosen_data = col.selectbox(
	        'Select specific data to display bar plot:',
	        stats.documents,
	        key="selector_data",
	    )
	    col.pyplot(stats.create_plot(chosen_data))


	def template_agreement_dataframe(title: str,
	                                 df: pandas.DataFrame,
	                                 total_pov: int,
	                                 total_annotations: int,
	                                 percentage_pov: float,
	                                 mode: str) -> None:
	    """Render one agreement section: title, ratio, CSV download and table.

	    :param title: section heading
	    :param df: dataframe shown in the page and offered as a CSV download
	    :param total_pov: number of annotations belonging to this section
	    :param total_annotations: overall number of annotations
	    :param percentage_pov: percentage displayed next to the ratio
	    :param mode: suffix used in the CSV file name and in the widget key
	    """
	    st.subheader(title)

	    ratio_line = f"{total_pov} / {total_annotations} annotations ({percentage_pov} %)"
	    st.markdown(ratio_line)

	    csv_payload = convert_df(df)
	    st.download_button(
	        label="Press to Download CSV",
	        data=csv_payload,
	        file_name=f"csv_annotators_{mode}.csv",
	        mime="text/csv",
	        key=f'download-csv_{mode}',
	    )

	    st.dataframe(df)


	def init_session_iaa(data: st.file_uploader,
	                     baseline: st.file_uploader,
	                     col: st.columns) -> None:
	    """Compute inter-annotator agreement statistics and render them.

	    Builds an ``IaaStatistics`` object from the uploaded project and the
	    baseline text, then writes to the page: the baseline text counts, the
	    Fleiss and pairwise Cohen kappa scores with their legend, an optional
	    confusion matrix, the agree / disagree tables and the agreement pies.

	    :param data: uploaded project archive, passed as ``zip_project``
	    :param baseline: uploaded baseline text; its ``getvalue()`` content and
	        its ``name`` are used
	    :param col: Streamlit container receiving the baseline text summary
	    """
	    project_analyzed = IaaStatistics(zip_project=data, baseline_text=baseline.getvalue())
	    baseline_analyzer = project_analyzed.analyze_text()

	    col.markdown(f"""
	### BASELINE TEXT: {baseline.name}

	- sentences: {baseline_analyzer[0]}
	- words: {baseline_analyzer[1]}
	- characters: {baseline_analyzer[2]}
	""")

	    st.markdown("## 📈 IAA metrics")
	    col1_kappa, col2_kappa = st.columns(2)

	    # Display Kappa group
	    col1_kappa.subheader("Fleiss Kappa (global score for group):")
	    col1_kappa.markdown(interpret_kappa(project_analyzed.fleiss_kappa), unsafe_allow_html=True)

	    # Display pairs kappa
	    col1_kappa.subheader("Cohen Kappa (score for annotators pair):")
	    for coders, c_k in project_analyzed.compute_pairs_cohen_kappa().items():
	        col1_kappa.markdown(f"* {coders[0]} <> {coders[1]} : {interpret_kappa(c_k)}",
	                            unsafe_allow_html=True)

	    # Display Kappa legend
	    col2_kappa.markdown(KAPPA_LEGEND, unsafe_allow_html=True)

	    # Plot confusion matrix
	    if st.checkbox('Display confusion matrix'):
	        # The default width (14) has to lie inside the slider range; the
	        # previous call declared a maximum of 10 with a default of 14.
	        width = st.slider("matrix width", min_value=1, max_value=14, value=14)
	        height = st.slider("matrix height", min_value=1, max_value=10, value=4)
	        st.pyplot(project_analyzed.plot_confusion_matrix(width=width, height=height).figure)

	    # Agree CSV
	    template_agreement_dataframe(title="✅️ Agree annotations",
	                                 df=project_analyzed.df_agree,
	                                 total_pov=project_analyzed.total_agree,
	                                 total_annotations=project_analyzed.total_annotations,
	                                 percentage_pov=project_analyzed.percent_agree,
	                                 mode="agree")
	    # Disagree CSV
	    template_agreement_dataframe(title="❌ Disagree annotations",
	                                 df=project_analyzed.df_disagree,
	                                 total_pov=project_analyzed.total_disagree,
	                                 total_annotations=project_analyzed.total_annotations,
	                                 percentage_pov=project_analyzed.percent_disagree,
	                                 mode="disagree")
	    # Pie plot
	    st.subheader("🏷️ Global Labels Statistics")
	    st.pyplot(project_analyzed.plot_agreement_pies().figure)


	def init_session_statistics(remote: bool, local: bool, data: tuple) -> None:
	    """Reset the session and load the global statistics object into it.

	    Exactly one of ``remote`` / ``local`` is expected to be true; when both
	    or neither are set, only the two flags are stored.

	    :param remote: fetch the curated documents from the INCEpTION host
	    :param local: build the statistics from a locally provided project
	    :param data: in remote mode, ``data[0]`` is the username and ``data[1]``
	        the password; in local mode it is passed unchanged as
	        ``zip_project`` to ``GlobalStatistics``
	    """
	    # Empty the session store in place. Rebinding ``st.session_state`` to a
	    # new dict would replace the attribute on the streamlit module with a
	    # plain dict instead of clearing the current session's state.
	    st.session_state.clear()

	    # create a session variable
	    st.session_state["gs_local"] = local
	    st.session_state["gs_remote"] = remote

	    # create a new object:
	    # if remote fetch data from API Host first
	    if remote and not local:
	        st.success('Fetch curated documents from host INCEpTION API in progress...')
	        try:
	            fetch_curated_data_from_remote(
	                username=data[0],
	                password=data[1]
	            )
	        except JSONDecodeError:
	            # username / password incorrect
	            st.error('Username or Password is incorrect please retry.')
	            st.session_state.clear()
	    elif local and not remote:
	        st.session_state["gs_obj"] = GlobalStatistics(zip_project=data, remote=False)


	def fetch_curated_data_from_remote(username: str,
	                                   password: str,
	                                   endpoint: str = "https://inception.dhlab.epfl.ch/prod",
	                                   project_title: str = "ner4archives-template") -> None:
	    """Download the curated documents of a project and store their statistics.

	    Every document whose state is ``CURATION_COMPLETE`` is exported as a
	    UIMA CAS XMI (XML 1.1) archive. The archives are merged into the first
	    one (members already present are skipped) and the merged archive is used
	    to build the ``GlobalStatistics`` object stored under
	    ``st.session_state["gs_obj"]``.

	    :param username: account name used to authenticate against the host
	    :param password: password used to authenticate against the host
	    :param endpoint: base URL of the INCEpTION host
	    :param project_title: name of the project to read
	    :raises IndexError: if no project is named ``project_title`` or if the
	        project has no curated document
	    """
	    # open a client
	    client = Pycaprio(inception_host=endpoint, authentication=(str(username), str(password)))

	    # get project object
	    matching_projects = [p for p in client.api.projects() if p.project_name == project_title]
	    if not matching_projects:
	        raise IndexError(f"no project named {project_title!r} found on {endpoint}")
	    project_id = matching_projects[0].project_id

	    # retrieve the export of every curated document of the project
	    curations = [
	        client.api.curation(project_id, document,
	                            curation_format=mappings.InceptionFormat.UIMA_CAS_XMI_XML_1_1)
	        for document in client.api.documents(project_id)
	        if document.document_state == mappings.DocumentState.CURATION_COMPLETE
	    ]
	    if not curations:
	        raise IndexError(f"project {project_title!r} has no curated document")

	    # Merge all zip in one: only the first archive is opened for appending,
	    # the others are read-only sources that are closed once copied.
	    first_curation, *other_curations = curations
	    with ZipFile(io.BytesIO(first_curation), mode="a") as merged:
	        known_names = set(merged.namelist())
	        for curation in other_curations:
	            with ZipFile(io.BytesIO(curation), mode="r") as source:
	                for member in source.namelist():
	                    if member not in known_names:
	                        merged.writestr(member, source.read(member))
	                        known_names.add(member)

	        # Create a new object while the merged archive is still open.
	        # NOTE(review): assumes GlobalStatistics reads the archive during
	        # construction; confirm it does not keep reading it afterwards.
	        st.session_state["gs_obj"] = GlobalStatistics(zip_project=merged, remote=True)


	def interpret_kappa(score: float) -> str:
	    """Format a kappa score as a coloured HTML percentage.

	    The colour bands are contiguous so that every score from below zero up
	    to perfect agreement gets a colour: negative scores use the first
	    colour, then the upper bounds 0.20, 0.40, 0.60 and 0.80 separate the
	    following bands, and anything above 0.80 uses the last colour. A score
	    that compares false against every bound (NaN) keeps an empty colour.

	    :param score: kappa score, shown multiplied by 100 and rounded to two
	        decimals
	    :return: an HTML ``<span>`` with the percentage in the band colour
	    """
	    # (inclusive upper bound, colour) for the non-negative bands, ascending
	    bands = (
	        (0.20, "#f39c12;"),
	        (0.40, "#f4d03f;"),
	        (0.60, "#5dade2;"),
	        (0.80, "#58d68d;"),
	        (float("inf"), "#28b463;"),
	    )

	    color = ""
	    if score < 0:
	        color = "#e74c3c;"
	    else:
	        for upper_bound, band_color in bands:
	            if score <= upper_bound:
	                color = band_color
	                break

	    return f"<span style='font-size:30px; color: {color}'>{round(score*100, 2)} %</span>"