Update Demo.py
Browse files
Demo.py
CHANGED
|
@@ -2,15 +2,12 @@ import streamlit as st
|
|
| 2 |
import sparknlp
|
| 3 |
import pandas as pd
|
| 4 |
import json
|
| 5 |
-
import os
|
| 6 |
|
| 7 |
from sparknlp.base import *
|
| 8 |
from sparknlp.annotator import *
|
| 9 |
from pyspark.ml import Pipeline
|
| 10 |
from sparknlp.pretrained import PretrainedPipeline
|
| 11 |
|
| 12 |
-
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3.8'
|
| 13 |
-
|
| 14 |
# Page configuration
|
| 15 |
st.set_page_config(
|
| 16 |
layout="wide",
|
|
@@ -40,13 +37,7 @@ st.markdown("""
|
|
| 40 |
|
| 41 |
@st.cache_resource
|
| 42 |
def init_spark():
|
| 43 |
-
|
| 44 |
-
spark = SparkSession.builder \
|
| 45 |
-
.appName("App") \
|
| 46 |
-
.config("spark.pyspark.python", "/usr/bin/python3.8") \
|
| 47 |
-
.config("spark.pyspark.driver.python", "/usr/bin/python3.8") \
|
| 48 |
-
.getOrCreate()
|
| 49 |
-
return spark
|
| 50 |
|
| 51 |
@st.cache_resource
|
| 52 |
def create_pipeline(model):
|
|
@@ -75,11 +66,20 @@ def create_pipeline(model):
|
|
| 75 |
pipeline = Pipeline(stages=[document_assembler, sentence_detector, table_assembler, tapas_wtq, tapas_sqa])
|
| 76 |
return pipeline
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
def fit_data(pipeline, json_data, question):
|
| 79 |
spark_df = spark.createDataFrame([[json_data, question]]).toDF("table_json", "questions")
|
| 80 |
model = pipeline.fit(spark_df)
|
| 81 |
-
|
| 82 |
-
|
|
|
|
|
|
|
| 83 |
|
| 84 |
# Sidebar content
|
| 85 |
model = st.sidebar.selectbox(
|
|
@@ -90,19 +90,19 @@ model = st.sidebar.selectbox(
|
|
| 90 |
|
| 91 |
# Set up the page layout
|
| 92 |
title = 'TAPAS for Table-Based Question Answering with Spark NLP'
|
| 93 |
-
sub_title = (
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
)
|
| 99 |
|
| 100 |
st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
|
| 101 |
st.markdown(f'<div class="section"><p>{sub_title}</p></div>', unsafe_allow_html=True)
|
| 102 |
|
| 103 |
# Reference notebook link in sidebar
|
| 104 |
link = """
|
| 105 |
-
<a href="https://
|
| 106 |
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
|
| 107 |
</a>
|
| 108 |
"""
|
|
@@ -149,7 +149,6 @@ queries = [
|
|
| 149 |
"How many billionaires are in the conglomerate industry?"
|
| 150 |
]
|
| 151 |
|
| 152 |
-
|
| 153 |
# Load the JSON data into a DataFrame and display it
|
| 154 |
table_data = json.loads(json_data)
|
| 155 |
df_table = pd.DataFrame(table_data["rows"], columns=table_data["header"])
|
|
@@ -180,5 +179,12 @@ output = fit_data(pipeline, table_json_str, text_to_analyze)
|
|
| 180 |
|
| 181 |
# Display the output
|
| 182 |
st.markdown("---")
|
| 183 |
-
st.subheader("Processed
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import sparknlp
|
| 3 |
import pandas as pd
|
| 4 |
import json
|
|
|
|
| 5 |
|
| 6 |
from sparknlp.base import *
|
| 7 |
from sparknlp.annotator import *
|
| 8 |
from pyspark.ml import Pipeline
|
| 9 |
from sparknlp.pretrained import PretrainedPipeline
|
| 10 |
|
|
|
|
|
|
|
| 11 |
# Page configuration
|
| 12 |
st.set_page_config(
|
| 13 |
layout="wide",
|
|
|
|
| 37 |
|
| 38 |
@st.cache_resource
|
| 39 |
def init_spark():
|
| 40 |
+
return sparknlp.start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
@st.cache_resource
|
| 43 |
def create_pipeline(model):
|
|
|
|
| 66 |
pipeline = Pipeline(stages=[document_assembler, sentence_detector, table_assembler, tapas_wtq, tapas_sqa])
|
| 67 |
return pipeline
|
| 68 |
|
| 69 |
+
def fit_data(pipeline, data):
|
| 70 |
+
empty_df = spark.createDataFrame([['']]).toDF('text')
|
| 71 |
+
pipeline_model = pipeline.fit(empty_df)
|
| 72 |
+
model = LightPipeline(pipeline_model)
|
| 73 |
+
result = model.fullAnnotate(data)
|
| 74 |
+
return result
|
| 75 |
+
|
| 76 |
def fit_data(pipeline, json_data, question):
|
| 77 |
spark_df = spark.createDataFrame([[json_data, question]]).toDF("table_json", "questions")
|
| 78 |
model = pipeline.fit(spark_df)
|
| 79 |
+
lightPipelineModel = LightPipeline(model)
|
| 80 |
+
result = lightPipelineModel.fullAnnotate(data)
|
| 81 |
+
st.write(result)
|
| 82 |
+
return result
|
| 83 |
|
| 84 |
# Sidebar content
|
| 85 |
model = st.sidebar.selectbox(
|
|
|
|
| 90 |
|
| 91 |
# Set up the page layout
|
| 92 |
title = 'TAPAS for Table-Based Question Answering with Spark NLP'
|
| 93 |
+
sub_title = ("""
|
| 94 |
+
TAPAS (Table Parsing Supervised via Pre-trained Language Models) enhances the BERT architecture to effectively process tabular data, allowing it to answer complex questions about tables without needing to convert them into text.<br>
|
| 95 |
+
<br>
|
| 96 |
+
<strong>table_qa_tapas_base_finetuned_wtq:</strong> This model excels at answering questions that require aggregating data across the entire table, such as calculating sums or averages.<br>
|
| 97 |
+
<strong>table_qa_tapas_base_finetuned_sqa:</strong> This model is designed for sequential question-answering tasks where the answer to each question may depend on the context provided by previous answers.
|
| 98 |
+
""")
|
| 99 |
|
| 100 |
st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
|
| 101 |
st.markdown(f'<div class="section"><p>{sub_title}</p></div>', unsafe_allow_html=True)
|
| 102 |
|
| 103 |
# Reference notebook link in sidebar
|
| 104 |
link = """
|
| 105 |
+
<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/NER_HINDI_ENGLISH.ipynb">
|
| 106 |
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
|
| 107 |
</a>
|
| 108 |
"""
|
|
|
|
| 149 |
"How many billionaires are in the conglomerate industry?"
|
| 150 |
]
|
| 151 |
|
|
|
|
| 152 |
# Load the JSON data into a DataFrame and display it
|
| 153 |
table_data = json.loads(json_data)
|
| 154 |
df_table = pd.DataFrame(table_data["rows"], columns=table_data["header"])
|
|
|
|
| 179 |
|
| 180 |
# Display the output
|
| 181 |
st.markdown("---")
|
| 182 |
+
st.subheader("Processed Output")
|
| 183 |
+
|
| 184 |
+
# # Check if output is available
|
| 185 |
+
# if output:
|
| 186 |
+
# results_wtq = output[0][0] if output[0][0] else "No results found."
|
| 187 |
+
# results_sqa = output[0][1] if output[0][1] else "No results found."
|
| 188 |
+
# st.markdown(f"**Answers from WTQ model:** {', '.join(results_wtq)}")
|
| 189 |
+
# st.markdown(f"**Answers from SQA model:** {', '.join(results_sqa)}")
|
| 190 |
+
|