# jskinner215's picture
# Update app.py
# 45ee012
# raw
# history blame
# 3.86 kB
from copy import deepcopy
import streamlit as st
import pandas as pd
from io import StringIO
from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering
import numpy as np
# Initialize TAPAS model and tokenizer
_TAPAS_CHECKPOINT = "google/tapas-large-finetuned-wtq"


@st.cache_resource
def _load_tapas():
    """Load the TAPAS tokenizer and model, cached across script reruns.

    Streamlit re-executes this script from the top on every widget
    interaction. Caching the loaded objects as a shared resource means the
    checkpoint is instantiated once instead of on every rerun.

    Returns:
        A ``(tokenizer, model)`` tuple loaded from ``_TAPAS_CHECKPOINT``.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(_TAPAS_CHECKPOINT)
    loaded_model = AutoModelForTableQuestionAnswering.from_pretrained(_TAPAS_CHECKPOINT)
    return loaded_tokenizer, loaded_model


# Module-level names are kept because ask_llm_chunk reads them as globals.
tokenizer, model = _load_tapas()
def ask_llm_chunk(chunk, questions):
    """Answer each question against one table chunk with the TAPAS model.

    Args:
        chunk: pandas DataFrame holding one slice of the table. It is
            converted to strings and re-indexed from zero before use; the
            caller's frame is not modified.
        questions: List of question strings.

    Returns:
        A list with exactly one string per question, in question order.
        Each entry is the predicted cell value, several predicted cell
        values joined with ", ", an empty string when no cell was
        predicted, or an error placeholder when tokenization failed, the
        token limit was exceeded, or no predicted cell could be read.
    """
    # Imported locally so the file's import block does not have to change;
    # torch is only needed here to switch off gradient tracking.
    import torch

    # Cells are fed to the tokenizer as strings. A chunk sliced out of a
    # larger frame keeps its original row labels, so renumber from zero to
    # make labels and positions agree (presumably the tokenizer addresses
    # rows positionally — TODO confirm).
    chunk = chunk.astype(str).reset_index(drop=True)

    try:
        inputs = tokenizer(
            table=chunk,
            queries=questions,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
    except Exception as e:
        st.write(f"An error occurred: {e}")
        return ["Error occurred while tokenizing"] * len(questions)

    # Safeguard kept from the original. NOTE(review): with truncation
    # enabled the tokenizer is expected to cap the sequence length already,
    # so this branch may never fire — confirm.
    if inputs["input_ids"].shape[1] > 512:
        st.warning("Token limit exceeded for chunk")
        return ["Token limit exceeded for chunk"] * len(questions)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # The aggregation indices are returned by the tokenizer but are not used
    # when building the answers below.
    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
        inputs,
        outputs.logits.detach(),
        outputs.logits_aggregation.detach(),
    )

    answers = []
    for coordinates in predicted_answer_coordinates:
        cell_values = []
        for row, col in coordinates:
            try:
                cell_values.append(chunk.iloc[row, col])
            except Exception as e:
                st.write(f"An error occurred: {e}")

        if coordinates and not cell_values:
            # Every lookup failed. Still emit one entry so the answers stay
            # aligned with the questions for the caller.
            answers.append("Error occurred while reading answer cell")
        else:
            # One cell yields the cell text itself; none yields "".
            answers.append(", ".join(map(str, cell_values)))
    return answers
# Upper bound on the number of table rows handed to the model in one call.
MAX_ROWS_PER_CHUNK = 200


def summarize_map_reduce(data, questions):
    """Answer questions over a CSV table by querying it chunk by chunk.

    Map step: the table is cut into consecutive slices of at most
    ``MAX_ROWS_PER_CHUNK`` rows and every slice is passed to
    ``ask_llm_chunk`` together with all questions.
    Reduce step: for each question, the non-empty answers from all slices
    are joined with "; " in slice order.

    Args:
        data: CSV content as a single string.
        questions: List of question strings.

    Returns:
        A list with one string per question, in question order. For a table
        that fits in a single chunk this is that chunk's answers unchanged.
    """
    dataframe = pd.read_csv(StringIO(data))
    total_rows = len(dataframe)

    # max(..., 1) keeps one (empty) chunk for a header-only CSV, so every
    # question still gets a response from ask_llm_chunk.
    chunk_starts = range(0, max(total_rows, 1), MAX_ROWS_PER_CHUNK)

    collected = [[] for _ in questions]
    for start in chunk_starts:
        # reset_index returns a new, zero-based frame, so the parsed table
        # is never mutated by downstream processing.
        chunk = dataframe.iloc[start:start + MAX_ROWS_PER_CHUNK].reset_index(drop=True)
        chunk_answers = ask_llm_chunk(chunk, questions)
        for bucket, answer in zip(collected, chunk_answers):
            text = str(answer)
            if text:
                bucket.append(text)

    # One combined answer per question, which is what a caller that zips
    # the result against the questions expects.
    return ["; ".join(bucket) for bucket in collected]
st.title("TAPAS Table Question Answering")

# Upload CSV data
csv_file = st.file_uploader("Upload a CSV file", type=["csv"])

# Bound up front so the Submit handler can test it even when nothing has
# been uploaded (previously a NameError).
data = None
if csv_file is not None:
    try:
        # getvalue() returns the whole buffer regardless of the current
        # stream position, so it is safe across script reruns.
        data = csv_file.getvalue().decode("utf-8")
    except UnicodeDecodeError as e:
        st.error(f"Could not decode the uploaded file as UTF-8: {e}")
    else:
        st.write("CSV Data Preview:")
        st.write(pd.read_csv(StringIO(data)).head())

# Input for questions: one per line, ignoring blank or whitespace-only lines
questions = st.text_area("Enter your questions (one per line)")
questions = [line.strip() for line in questions.splitlines() if line.strip()]

if st.button("Submit"):
    if not data:
        st.warning("Please upload a CSV file first.")
    elif not questions:
        st.warning("Please enter at least one question.")
    else:
        answers = summarize_map_reduce(data, questions)
        st.write("Answers:")
        for q, a in zip(questions, answers):
            st.write(f"Question: {q}")
            st.write(f"Answer: {a}")

# Add Ctrl+Enter functionality for submitting the questions
# NOTE(review): <script> tags injected through st.markdown are presumably not
# executed by the browser — confirm; if so this snippet has no effect and a
# custom component would be needed instead.
st.markdown("""
<script>
document.addEventListener("DOMContentLoaded", function(event) {
document.addEventListener("keydown", function(event) {
if (event.ctrlKey && event.key === "Enter") {
document.querySelector(".stButton button").click();
}
});
});
</script>
""", unsafe_allow_html=True)