| import math |
| import numpy as np |
| import streamlit as st |
| from tomlkit import datetime |
| from engine import AnalyzerEngine |
| import pandas as pd |
| import datetime |
| import plotly.graph_objects as go |
| import os |
|
|
| COUNTER_FILE = "/data/visits.txt" |
|
|
| def get_and_increment_visits(): |
| |
| if not os.path.exists("/data"): |
| return "Non abilitato" |
| |
| |
| if os.path.exists(COUNTER_FILE): |
| with open(COUNTER_FILE, "r") as f: |
| content = f.read().strip() |
| visits = int(content) if content.isdigit() else 0 |
| else: |
| visits = 0 |
|
|
| |
| visits += 1 |
| with open(COUNTER_FILE, "w") as f: |
| f.write(str(visits)) |
| |
| return visits |
|
|
| |
| if 'visit_counted' not in st.session_state: |
| st.session_state.total_visits = get_and_increment_visits() |
| st.session_state.visit_counted = True |
| |
| st.set_page_config(page_title="Co\u00B3 Suite", layout="wide") |
| st.markdown(""" |
| <style> |
| /* Forza il contenitore dei tab ad andare a capo in automatico */ |
| div[data-baseweb="tab-list"] { |
| flex-wrap: wrap !important; |
| gap: 8px; /* Spazio orizzontale e verticale tra i tab */ |
| } |
| |
| /* Aggiunge un po' di margine per staccare visivamente le righe */ |
| button[data-baseweb="tab"] { |
| margin-bottom: 5px !important; |
| white-space: nowrap !important; /* Evita che il testo dentro il singolo tab vada a capo */ |
| } |
| </style> |
| """, unsafe_allow_html=True) |
| |
| @st.cache_resource |
| def load_full_engine(): |
| return AnalyzerEngine() |
|
|
| engine = load_full_engine() |
|
|
| |
| if 'res' not in st.session_state: |
| st.session_state.res = None |
|
|
| def prepare_export_data(res, fig_res=None): |
| rows = [] |
| |
| |
| categories = { |
| "Statistiche Base": res.get("basic", {}), |
| "Lessico": res.get("lexical", {}), |
| "Sintassi e Verbi": res.get("syntax", {}), |
| "Coesione": res.get("cohesion", {}), |
| "Consecutio": res.get("consecutio", {}), |
| "Combined": res.get("combined", {}) |
| } |
| |
| for cat_name, dict_data in categories.items(): |
| for key, value in dict_data.items(): |
| |
| if isinstance(value, (int, float, str)) and key not in ["texts", "conll", "doc"]: |
| rows.append({"Categoria": cat_name, "Metrica": key, "Valore": value}) |
| |
| |
| connectors = res.get("cohesion", {}).get("connectors", {}) |
| for conn_type, val in connectors.items(): |
| rows.append({"Categoria": "Connettori", "Metrica": f"Conn_{conn_type}", "Valore": val}) |
|
|
| |
| bert_metrics = ["mds_s", "mds_w", "total"] |
| for m in bert_metrics: |
| val = fig_res.get(m, 0) if fig_res else 0 |
| rows.append({"Categoria": "Figuratività (BERT)", "Metrica": m, "Valore": val}) |
| |
| return pd.DataFrame(rows) |
|
|
|
|
| def clear_all(): |
| st.session_state.res = None |
| st.session_state.input_text = "" |
|
|
| st.title("📝 Co\u00B3 Suite") |
| st.caption("Hybrid Text Analysis Dashboard with parameters for Coherence, Cohesion and Complexity evaluation powered by a custom-built engine and BERT.") |
|
|
| lang_choice = st.radio("Language:", ["English", "Italiano"], horizontal=True) |
| st.session_state.lang = "it" if lang_choice == "Italiano" else "en" |
|
|
|
|
| |
| uploaded_file = st.file_uploader("Upload a text file (.txt)", type=("txt")) |
| if uploaded_file is not None: |
| |
| stringio = uploaded_file.getvalue().decode("utf-8") |
| |
| st.session_state.input_text = stringio |
| if len(stringio) > 30000: |
| st.session_state.input_text = stringio[:30000] |
| st.session_state.source_name = uploaded_file.name.replace(".txt", "") |
| else: |
| |
| if st.session_state.get('input_text', "").strip(): |
| st.session_state.source_name = "CustomText" |
|
|
| txt = st.text_area("Insert Text (max 30,000 characters):", |
| height=200, |
| max_chars=30000, |
| key="input_text") |
| input_final = st.session_state.input_text if uploaded_file is None else stringio |
|
|
| col_btn1, col_btn2, _ = st.columns([1, 1, 4]) |
| with col_btn1: |
| if st.button("Start Analysis", type="primary"): |
| if len(input_final) > 30000: |
| input_text = st.session_state.input_text[:30000] |
| st.error(f"The text is too long ({len(input_final)} characters). The maximum limit is 30,000.") |
| if st.session_state.input_text.strip(): |
| st.session_state.pop("tab6_results", None) |
| with st.spinner("Analyzing..."): |
| full_res = engine.run(st.session_state.input_text,lang=st.session_state.lang) |
| |
| st.session_state.res = full_res |
| |
| st.session_state.nlp_doc = full_res["doc"] |
| else: |
| st.warning("Please insert some text.") |
| with col_btn2: |
| st.button("Clean Fields", on_click=clear_all) |
|
|
| |
| if st.session_state.res: |
| r = st.session_state.res |
| b, l, s, c , d = r["basic"], r["lexical"], r["syntax"], r["cohesion"], r['consecutio'] |
|
|
| t1, t2, t3, t4, t5, t6, t7 = st.tabs([ |
| "📊 Basical Statistics", |
| "📚 Lexical features ", |
| "🏗️ Sintax & Verbs", |
| "🧠 Cohesion & Connectives", |
| "🎭 Figurative Language", |
| "📈 Combined Metrics", |
| "📖 ILA Index" |
| ]) |
|
|
| |
| with t1: |
| st.subheader("Quantitative Basics") |
| c1, c2, c3, c4 = st.columns(4) |
| c1.metric("Total Tokens", b["tokens"]) |
| c2.metric("Total Sentences", b["sentences"]) |
| c3.metric("Total Paragraphs", b["paragraphs"]) |
| c4.metric("Hapax Legomena", l.get("hapax", 0), help="Number of words that appear only once in the text.") |
| c1.metric("Hapax Ratio", l.get("hapax_ratio", 0)) |
| c2.metric("Type-Token Ratio (TTR)", l.get("TTR", 0), help="Ratio of unique words (types) to total words (tokens).") |
| c3.metric("Gunning Fog Index", l.get("gunning_fog", 0), help="An estimate of the years of formal education needed to understand the text on a first reading. This metric is based on the average sentence length and the percentage of complex words (words with three or more syllables).") |
| c4.metric("HD-D Index", l.get("HD-D", 0), help="A measure of lexical diversity that accounts for the frequency of word usage. It is calculated as the ratio of the number of unique words (types) to the total number of words (tokens) raised to the power of 0.5. A higher HD-D index indicates greater lexical diversity, while a lower index suggests more repetition in word usage.") |
|
|
| st.divider() |
| st.subheader("Lexical Range Metrics and Concreteness") |
| col_r1, col_r2, col_r3, col_r4 = st.columns(4) |
| col_r1.metric("Lexical Range 1", l.get("r1", 0),help="Lexical Range 1: Percentage of words that are among the 1000 most common words in the language according to the dictionaries of the Nation’s Range program. A lower percentage indicates a wider lexical range, while a higher percentage suggests a more limited vocabulary.") |
| col_r2.metric("Lexical Range 2", l.get("r2", 0), help="Lexical Range 2: Percentage of words that are among the 2000 most common words in the language according to the dictionaries of the Nation’s Range program. ") |
| col_r3.metric("Lexical Range 3", l.get("r3", 0), help="Lexical Range 3: Percentage of words percentage of words belonging to \"The Academic Word List\"") |
| col_r4.metric("Concreteness (MRC)", round(l.get("concreteness", 0), 2), help="Average concreteness rating of the words in the text based on the MRC Psycholinguistic Database. Concreteness ratings range from 100 (very abstract) to 700 (very concrete). A higher average concreteness score indicates that the text contains more concrete and tangible words, while a lower score suggests a more abstract vocabulary.") |
|
|
| st.divider() |
| st.subheader("🔗 Bigrams & Trigrams (PMI)") |
|
|
| |
| if "nlp_doc" in st.session_state: |
| current_doc = st.session_state.nlp_doc |
| all_lemmas = [t.lemma_.lower() for t in current_doc if not t.is_punct and not t.is_space] |
| |
| c_bi, c_tri = st.columns(2) |
| with c_bi: |
| df_bi = engine.freq_mod.get_pmi(all_lemmas, n=2) |
| st.caption("Top Bigrams") |
| st.dataframe(df_bi, use_container_width=True, hide_index=True) |
| if not df_bi.empty: |
| st.download_button("📥 Download Bigrams", df_bi.to_csv(index=False).encode('utf-8'), "bigrams.csv", "text/csv", key="dl_bi") |
| with c_tri: |
| df_tri = engine.freq_mod.get_pmi(all_lemmas, n=3) |
| st.caption("Top Trigrams") |
| st.dataframe(df_tri, use_container_width=True, hide_index=True) |
| if not df_tri.empty: |
| st.download_button("📥 Download Trigrams", df_tri.to_csv(index=False).encode('utf-8'), "trigrams.csv", "text/csv", key="dl_tri") |
|
|
| st.divider() |
| st.subheader("📈 Frequencies & TF-IDF Trends") |
| |
| |
| col_split1, col_split2 = st.columns([1, 3]) |
| split_mode = col_split1.radio("Split Method:", ["numeric", "regex"]) |
| if split_mode == "numeric": |
| split_val = col_split2.number_input("Number of parts:", min_value=2, max_value=20, value=5) |
| else: |
| split_val = col_split2.text_input("Regex pattern (e.g. \\n\\n for paragraphs):", value="\n\n") |
|
|
| |
| chunks = engine.freq_mod.chunk_doc(st.session_state.input_text, current_doc, mode=split_mode, val=split_val) |
| df_freq, df_tfidf = engine.freq_mod.get_trends(chunks) |
|
|
| if not df_freq.empty: |
| |
| top_overall = df_freq.groupby("Word")["Count"].sum().nlargest(5).index.tolist() |
| |
| selected_words = st.multiselect("Select words to plot:", |
| options=df_freq["Word"].unique(), |
| default=top_overall) |
| |
| if selected_words: |
| plot_data_freq = df_freq[df_freq["Word"].isin(selected_words)] |
| plot_data_tfidf = df_tfidf[df_tfidf["Word"].isin(selected_words)] |
|
|
| |
| fig_f = go.Figure() |
| for word in selected_words: |
| w_data = plot_data_freq[plot_data_freq["Word"] == word] |
| fig_f.add_trace(go.Scatter(x=w_data["Part"], y=w_data["Count"], mode='lines+markers', name=word)) |
| fig_f.update_layout(title="Absolute Frequencies per Chunk", xaxis_title="Chunk", yaxis_title="Count") |
| |
| |
| fig_t = go.Figure() |
| for word in selected_words: |
| w_data = plot_data_tfidf[plot_data_tfidf["Word"] == word] |
| fig_t.add_trace(go.Scatter(x=w_data["Part"], y=w_data["TF-IDF"], mode='lines+markers', name=word, line=dict(dash='dot'))) |
| fig_t.update_layout(title="TF-IDF per Chunk", xaxis_title="Chunk", yaxis_title="TF-IDF Score") |
|
|
| st.plotly_chart(fig_f, use_container_width=True) |
| st.plotly_chart(fig_t, use_container_width=True) |
| |
| st.divider() |
| col_dl1, col_dl2 = st.columns(2) |
| col_dl1.download_button( |
| "📥 Download Complete Frequencies", |
| df_freq.to_csv(index=False).encode('utf-8'), |
| "frequecies.csv", "text/csv", key="dl_freq" |
| ) |
| col_dl2.download_button( |
| "📥 Scarica TF-IDF Completo", |
| df_tfidf.to_csv(index=False).encode('utf-8'), |
| "tfidf.csv", "text/csv", key="dl_tfidf" |
| ) |
| else: |
| st.info("Select at least one word to display the charts.") |
| |
| |
| with t2: |
| st.subheader("Pronouns and Nouns") |
| l1, l2, l3, l4 = st.columns(4) |
| l1.metric("Pronouns", l.get("pronouns", 0)) |
| l2.metric("Nouns", l.get("nouns", 0)) |
| l3.metric("Ratio Pronouns/Nouns", l.get("pron_noun_ratio", 0)) |
| l4.metric("First Person Pronouns", l.get("first_person_ratio", 0)) |
| l1.metric("Modifiers per Noun", s.get("mod_per_noun", 0),help="Average number of modifiers (adjectives, relative clauses, etc.) per noun in the text. A higher value indicates a more descriptive and detailed use of nouns, while a lower value suggests a simpler noun usage.") |
| |
|
|
| st.divider() |
| st.subheader("Adjectives and Emphatic particles") |
| d1, d2, d3, d4 = st.columns(4) |
| d1.metric("Deictics", l.get("deictics", 0)) |
| d2.metric("Adjectives", s.get("adj_count", 0)) |
| d3.metric("Adj per Sentence", s.get("adj_x_sent", 0)) |
| d4.metric("Emphatic Particles", l.get("emphatic_particles", 0)) |
| d1.metric("Deictic/Articles", l.get("deictic_Frequency", 0)) |
| d2.metric("attributive/Adj Ratio", s.get("attr_adjs_ratio", 0)) |
| d3.metric("attributive Adj Frequency", s.get("attr_adjs_freq", 0)) |
| |
| st.divider() |
| st.subheader("Articles") |
| j1, j2, j3, j4 = st.columns(4) |
| j1.metric("Articles", l.get("articles", 0)) |
| j2.metric("Definite Articles", l.get("definite_articles", 0)) |
| j3.metric("Demonstratives articles", l.get("demonstratives", 0)) |
| j3.metric("Demonstratives per sentence", l.get("demonstratives_ratio", 0)) |
| j2.metric("Definite articles per sentence", l.get("definite_articles_ratio", 0)) |
| |
| |
| |
| with t3: |
| st.subheader("Syntactic Measures") |
| s1, s2, s3, s4 = st.columns(4) |
| s1.metric("Average Sentence Length", s.get("avg_sent_len", 0)) |
| s2.metric("Subordinate per Sentence", s.get("sub_ratio", 0)) |
| s3.metric("Relatives per Sentence", s.get("rel_clauses_per_sent", 0)) |
| s4.metric("Distance from Root", s.get("root_dist", 0),help="Average distance from the root of the dependency tree to the other nodes. A higher value indicates a more complex syntactic structure, while a lower value suggests a simpler structure.") |
| s1.metric("Hypotactic depth", d.get("avg_depth", 0),help="Average depth of the dependency tree.") |
| s2.metric("sentence depth variance", round(d.get("sentence_depths", {}), 4), help="Variance of the depths of the dependency trees across sentences. A higher variance indicates greater variability in sentence complexity, while a lower variance suggests more uniformity in sentence structure.") |
| s1.metric("Punctuation Pairs per Sentence", s.get("punct_pairs_per_sent", 0),help="Number of punctuation pairs (e.g., parentheses, quotes) per sentence.") |
| s2.metric("Subj-Verb_Obj Inversions per sentence", s.get("svo_inversions_per_sent", 0),help="Number of subj-verb-obj inversions per sentence.") |
| s3.metric("Number of Subordinate Clauses (completive excluded)", s.get("non_comp_sub_per_sent", 0),help="Number of non-completive subordinate clauses per sentence.") |
| st.divider() |
|
|
| st.subheader("Verbs & Tenses") |
| v1, v2, v3, v4 = st.columns(4) |
| v1.metric("Ratio Present/Verbs", s.get("present_ratio", 0)) |
| v2.metric("Ratio Past/Verbs", s.get("past_ratio", 0)) |
| v3.metric("Ratio Participles/Verbs", s.get("participle_ratio", 0)) |
| v4.metric("Consecutio Index", d.get("consecutio_index", 0),help="Index measuring the sequential relationship between clauses.") |
| v1.metric("Temporal Stability", f"{d.get('tense_stability', 0)}", help="Percentage of verbs that maintain the same tense across the text. A higher percentage indicates greater temporal stability, while a lower percentage suggests more frequent tense shifts.") |
| v2.metric("Verbal Density", f"{d.get('verb_density', 0)}",help="Percentage of words that are verbs in the text. A higher percentage indicates a more verb-heavy text, while a lower percentage suggests a less verb-heavy text.") |
|
|
| st.divider() |
| st.subheader("Dependency Tree Inspection (CoNLL-U)") |
| |
| |
| idx = st.selectbox( |
| "Select a sentence to inspect:", |
| range(len(b["texts"])), |
| format_func=lambda i: f"Frase {i+1}: {b['texts'][i][:70]}..." |
| ) |
| |
| |
| st.text_area( |
| label="CoNLL-U Format (Tab-Separated):", |
| value=b["conll"][idx], |
| height=300 |
| ) |
|
|
| |
| with t4: |
| st.subheader("Textual Cohesion Metrics") |
| m1, m2, m3, m4 = st.columns(4) |
| m1.metric("Lemma Overlap (adjacent)", f"{c.get('lexical_cohesion_local', 0)*100:.2f}%",help="Percentage of lemmas that are shared between adjacent paragraphs. A higher percentage indicates stronger local cohesion, while a lower percentage suggests weaker local cohesion.") |
| m2.metric("Lemma Overlap (3 paragraphs)", f"{c.get('lexical_cohesion_global', 0)*100:.2f}%",help="Percentage of lemmas that are shared between sentences in different paragraphs. ") |
| m3.metric("Semantic Overlap (Sentences)", f"{c.get('semantic_cohesion_sentences', 0)*100:.2f}%",help="Percentage of semantic relationships that are shared between adjacent sentences. This metric is calculated by using BERT to identify similarities in meaning between sentences. ") |
| m4.metric("Semantic Overlap (Paragraphs)", f"{c.get('semantic_cohesion_paragraphs', 0)*100:.2f}%",help="Percentage of semantic relationships that are shared between sentences in different paragraphs. ") |
|
|
| st.divider() |
| st.subheader("Connector Frequency (Normalized)") |
| |
| |
| conn_data = c.get("connectors", {}) |
| col_c1, col_c2 = st.columns(2) |
| with col_c1: |
| st.bar_chart(pd.Series({ |
| "Adictives +": conn_data.get("AdPos", 0), |
| "Adictives -": conn_data.get("AdNeg", 0), |
| "Causals +": conn_data.get("CausPos", 0), |
| "Causals -": conn_data.get("CausNeg", 0) |
| })) |
| with col_c2: |
| st.bar_chart(pd.Series({ |
| "Temporals +": conn_data.get("TempPos", 0), |
| "Temporals -": conn_data.get("TempNeg", 0), |
| "Logics +": conn_data.get("LogPos", 0), |
| "Logics -": conn_data.get("LogNeg", 0) |
| })) |
| |
|
|
| st.info(f"Cohesion Value\n\n{c.get('general_cohesion', 0)}\n\nThe logarithm of the standard deviation, using the weighted sum of frequencies as the base.") |
|
|
| |
| with t5: |
| st.header("🎭 Figurative Language Analysis (BERT V5)", help="The analysis may take some time to complete. Large texts may require several minutes.") |
| |
| if "nlp_doc" not in st.session_state: |
| st.warning("Before proceeding, run the general analysis in Tab 1.") |
| else: |
| |
| col_btn, col_chk, col_sld = st.columns([1.5, 1, 2], gap="medium") |
|
|
| with col_chk: |
| use_sampling = st.checkbox("Sample mode", value=True, help="Analyze only a random portion of the text.") |
|
|
| with col_sld: |
| sample_rate = 1.0 |
| if use_sampling: |
| sample_rate = st.slider("Sample Size %", 5, 95, 10, 5) / 100 |
|
|
| with col_btn: |
| |
| launch = st.button("Launch Metaphor Detector", type="secondary", use_container_width=True) |
|
|
| if launch: |
| with st.spinner("BERT is executing the Masked Language Modeling..."): |
| lang = st.session_state.get("lang", "en") |
| res = engine.fig_mod.analyze(st.session_state.nlp_doc, |
| sample_rate=sample_rate, |
| lang=lang) |
| st.session_state.fig_results_data = res |
| if "fig_results_data" in st.session_state: |
| res = st.session_state.fig_results_data |
| st.subheader("MDS indices (Metaphor Density Score)") |
| c1, c2, c3 = st.columns(3) |
| c1.metric("MDS-S (per Sentence)", round(res["mds_s"], 4)) |
| st.subheader("MDS indices (Metaphor Density Score)") |
| c1, c2, c3 = st.columns(3) |
| c1.metric("MDS-S (per Sentence)", round(res["mds_s"], 4),help="Metaphor Density Score per Sentence: This metric calculates the average number of metaphors per sentence in the text. Metaphor detection scans syntactic pairs (Subj-Verb, Obj-Verb, Noun-Adj) by masking terms and using BERT to predict contextual expectations; it flags a metaphor when the semantic similarity between the original word (neutralized via person/thing placeholders) and BERT's top candidates falls below a 0.90 threshold.") |
| c2.metric("MDS-W (per 1k Words)", round(res["mds_w"], 2),help="This metric calculates the average number of metaphors per 1,000 words in the text.") |
| c3.metric("Total Metaphors", res["total"],help="Total number of metaphors detected in the text.") |
| if res.get("is_sample"): |
| st.caption(f"⚠️ Note: These scores are estimated based on a {sample_rate*100:.0f}% random sample of the text.") |
| st.divider() |
| st.subheader("🔍 Retrieved semantic anomalies") |
| |
| if not res["detections"]: |
| st.info("No semantic anomalies detected with the current thresholds.") |
| else: |
| for d in res["detections"]: |
| |
| color = "red" if d['probability'] > 85 else "orange" if d['probability'] > 70 else "blue" |
| with st.expander(f":{color}[{d['term']} ↔ {d['head']}] - Probabilità: {d['probability']}%"): |
| st.write(f"**Context:** _{d['sentence']}_") |
| st.caption(f"Logic: {d['reason']} | s1: {d['s1']} | s2: {d['s2']}") |
|
|
| |
| with t6: |
| st.subheader("📈 Combined Metrics & Positioning") |
| if st.session_state.get("lang", "en") == "it": |
| st.warning( |
| "⚠️ **Combined Metrics not available for Italian Language.**\n\n" |
| "The reference database is in English. " |
| "The Italian module is under developement." |
| ) |
| else: |
| if st.session_state.res: |
| fig_results = st.session_state.get("fig_results_data", None) |
| if not fig_results: |
| st.info("💡 **Caution:** the figurative metrics (BERT) have not been calculated. The model will automatically exclude them from comparison with historical texts.") |
| |
| |
| if st.button("Calculate Positioning and Quality Score", type="primary"): |
| with st.spinner("Clustering and Correlation Calculation in Progress..."): |
| try: |
| source_name = st.session_state.get("source_name", "CustomText") |
| |
| |
| q_score, fig_clustering, classe_assegnata = engine.run_combined_analysis( |
| res=st.session_state.res, |
| fig_res=fig_results, |
| source_name=source_name, |
| ) |
| |
| st.session_state.tab6_results = (q_score, fig_clustering, classe_assegnata) |
| except FileNotFoundError: |
| st.warning("Historical database ('database_completo_largo.pkl') not found.") |
| except Exception as e: |
| st.error(f"Error occurred during combined analysis: {e}") |
| |
| |
| if "tab6_results" in st.session_state: |
| q_score, fig_clustering, classe_assegnata = st.session_state.tab6_results |
| |
| |
| col_score, col_desc = st.columns([1, 2]) |
| |
| with col_score: |
| |
| st.metric("Quality Score", f"{q_score:.3f}") |
| |
| with col_desc: |
| if q_score >= 3: |
| st.success("🌟 Excellent!") |
| elif q_score >= 2: |
| st.info("📊 Good: Balanced style, tending towards quality works.") |
| elif q_score < 2 and q_score >= -15: |
| st.warning("⚠️ Fair: Common traits with consumer or debut literature.") |
| elif q_score < 0.5: |
| st.error("📉 Poor: Stylistically close to amateur or basic works.") |
| |
| with st.expander("ℹ️ How is this score interpreted?"): |
| st.markdown(""" |
| The score is calculated by measuring the stylistic correlation (Pearson) of your text against 4 predefined classes of works, penalizing similarity to amateur texts. |
| |
| * **theoric range**: from **0** to **4** |
| * **Towards 3.0**: Maximum affinity with Great Classics/Masterpieces and Great Bestsellers. |
| * **Around 2.0**: Neutral or hybrid style. |
| * **Towards 0.0**: Maximum affinity with Amateur Works and Basic Genre Literature. |
| """) |
| |
| st.info(f"🏷️ **Predicted Class:**\n\n{classe_assegnata}") |
| st.divider() |
| st.plotly_chart(fig_clustering, use_container_width=True) |
| |
| else: |
| st.info("Perform the basic analysis in Tab 1 first.") |
|
|
|
|
| |
| with t7: |
| st.subheader("📖 Index of Automatic Readability (ILA)") |
| |
| |
| lr1 = l.get("r1", 0) |
| lr2 = l.get("r2", 0) |
| lr3 = l.get("r3", 0) |
| |
| punct = s.get("punct_pairs_per_sent", 0) |
| svo = s.get("svo_inversions_per_sent", 0) |
| non_comp = s.get("non_comp_sub_per_sent", 0) |
| |
| |
| somma_lr = lr1 + lr2 + lr3 |
| comp_lessicale = math.log(((lr1 * 2) + lr2 + lr3) / somma_lr if somma_lr > 0 else 0.0) |
| |
| |
| comp_sintattica = (punct + svo + non_comp) |
| |
| ila_score = comp_lessicale - comp_sintattica |
|
|
| |
| col_il1, col_il2, col_il3 = st.columns(3) |
| col_il1.metric("Lexical Readability", f"{comp_lessicale:.4f}", help="(Log((LR1 * 2) + LR2 + LR3) / Somma LR)") |
| col_il2.metric("Sytactic Complexity", f"- {comp_sintattica:.4f}", help="Sum of: Punctuation Pairs per Sentence + SVO Inversions per Sentence + Non-completive Subordinate Clauses per Sentence") |
| col_il3.metric("Final ILA Score", f"{ila_score:.4f}") |
| |
| st.info("💡 ** How to Interpret the ILA Index:** A higher value indicates greater readability and fluency. The score rewards the use of high-frequency vocabulary (LR1) and penalizes syntactic complexity (inversions, parentheses, and complex subordinate clauses).") |
| |
| st.divider() |
|
|
| |
| st.subheader("📊 Comparison of ILA with Reference Works") |
| |
| |
| books = ["Ulysses (Part III)<br>(Hard)", "The Hobbit<br>(Med. Hard)", "The Little Prince<br>(Med. Easy)", "Isodora Moon goes to School<br>(Easy)", "📍 Your Text"] |
| lex_vals = [0.63, 0.64, 0.65, 0.64, comp_lessicale] |
| syn_vals = [-2.73, -1.29, -0.87, -0.53, -comp_sintattica] |
| ila_vals = [-2.1, -0.65, -0.23, 0.11, ila_score] |
|
|
| fig_ila = go.Figure() |
| |
| |
| fig_ila.add_trace(go.Bar(name='Lexical Readability (+)', x=books, y=lex_vals, marker_color='#00cc96')) |
| fig_ila.add_trace(go.Bar(name='Sytactic Complexity (-)', x=books, y=syn_vals, marker_color='#ff4b4b')) |
| |
| |
| fig_ila.add_trace(go.Scatter( |
| name='ILA Score', |
| x=books, y=ila_vals, |
| mode='lines+markers+text', |
| text=[f"{v:.2f}" for v in ila_vals], |
| textposition="top center", |
| textfont=dict(color="black", size=12), |
| marker=dict(color='white', size=10, line=dict(color='black', width=2)), |
| line=dict(color='black', width=2) |
| )) |
| |
| |
| fig_ila.update_layout( |
| barmode='relative', |
| height=400, |
| margin=dict(l=20, r=20, t=40, b=20), |
| legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1) |
| ) |
| |
| st.plotly_chart(fig_ila, use_container_width=True) |
| |
| st.divider() |
|
|
| |
| |
| |
| st.subheader("📊 Comparison of Standard Indices") |
| |
| |
| hdd = l.get("HD-D", 0) |
| gunning_fog = l.get("gunning_fog", 0) |
| |
| |
| tokens_count = b.get("tokens", 1) |
| sents_count = b.get("sentences", 1) |
| chars_count = b.get("chars", 1) |
| letters_count = chars_count - tokens_count |
| |
| gulpease = 89 + ((300 * sents_count) - (10 * letters_count)) / tokens_count if tokens_count > 0 else 0 |
| gulpease = max(0, min(100, gulpease)) |
| |
| |
| |
| fig_gulp = go.Figure(go.Indicator( |
| mode = "gauge+number", |
| value = gulpease, |
| title = {'text': "Gulpease<br><span style='font-size:0.8em;color:gray'>Higher = Easier</span>"}, |
| gauge = { |
| 'axis': {'range': [0, 100]}, |
| 'bar': {'color': "rgba(0,0,0,0.5)"}, |
| 'steps' : [ |
| {'range': [0, 40], 'color': "#ff4b4b"}, |
| {'range': [40, 60], 'color': "#ffa500"}, |
| {'range': [60, 100], 'color': "#00cc96"} |
| ]} |
| )) |
| |
| |
| fig_fog = go.Figure(go.Indicator( |
| mode = "gauge+number", |
| value = gunning_fog, |
| title = {'text': "Gunning Fog<br><span style='font-size:0.8em;color:gray'>Higher = Harder</span>"}, |
| gauge = { |
| 'axis': {'range': [0, 25]}, |
| 'bar': {'color': "rgba(0,0,0,0.5)"}, |
| 'steps' : [ |
| {'range': [0, 9], 'color': "#00cc96"}, |
| {'range': [9, 14], 'color': "#ffa500"}, |
| {'range': [14, 25], 'color': "#ff4b4b"} |
| ]} |
| )) |
| |
| |
| fig_hdd = go.Figure(go.Indicator( |
| mode = "gauge+number", |
| value = hdd, |
| title = {'text': "Diversity (HD-D)<br><span style='font-size:0.8em;color:gray'>Higher = More Diverse</span>"}, |
| gauge = { |
| 'axis': {'range': [0, 50]}, |
| 'bar': {'color': "rgba(0,0,0,0.5)"}, |
| 'steps' : [ |
| {'range': [0, 30], 'color': "#ff4b4b"}, |
| {'range': [30, 40], 'color': "#ffa500"}, |
| {'range': [40, 50], 'color': "#00cc96"} |
| ]} |
| )) |
|
|
| |
| for fig in [fig_gulp, fig_fog, fig_hdd]: |
| fig.update_layout(height=260, margin=dict(l=20, r=20, t=90, b=20)) |
|
|
| |
| cg1, cg2, cg3 = st.columns(3) |
| with cg1: |
| st.plotly_chart(fig_gulp, use_container_width=True) |
| with cg2: |
| st.plotly_chart(fig_fog, use_container_width=True) |
| with cg3: |
| st.plotly_chart(fig_hdd, use_container_width=True) |
| |
|
|
| st.divider() |
| if st.session_state.res: |
| |
| fig_results = st.session_state.get("fig_results_data", None) |
| |
| |
| df_export = prepare_export_data(st.session_state.res, fig_results) |
|
|
| base_name = st.session_state.get("source_name", "CustomText") |
| timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M") |
| csv_name = f"Co3_{base_name}_{timestamp}.csv" |
| csv_data = df_export.to_csv(index=False).encode('utf-8') |
|
|
| st.download_button( |
| label="📥 Export Results in CSV", |
| data=csv_data, |
| file_name=csv_name, |
| mime='text/csv' |
| ) |
| visite = st.session_state.get("total_visits", 0) |
| st.caption("Co\u00B3 Suite | {visite}") |