Spaces:

jskinner215
/

TAPAS_WTQ_Chunking

Build error

App Files Files Community

TAPAS_WTQ_Chunking / app.py

jskinner215

Update app.py

dfcdc4f over 2 years ago

raw

history blame

5.73 kB

	from copy import deepcopy
	import streamlit as st
	import pandas as pd
	from io import StringIO
	from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering
	import numpy as np
	import weaviate
	from weaviate.embedded import EmbeddedOptions
	from weaviate import Client, ObjectsBatchRequest

	# Single source of truth for the checkpoint so tokenizer and model always match.
	TAPAS_CHECKPOINT = "google/tapas-large-finetuned-wtq"


	@st.cache_resource
	def _load_tapas():
	    """Load the TAPAS tokenizer and table-QA model once per server process.

	    Streamlit re-executes this script from top to bottom on every widget
	    interaction; caching keeps the checkpoint from being loaded again on
	    each rerun.

	    Returns:
	        A ``(tokenizer, model)`` tuple for ``TAPAS_CHECKPOINT``.
	    """
	    return (
	        AutoTokenizer.from_pretrained(TAPAS_CHECKPOINT),
	        AutoModelForTableQuestionAnswering.from_pretrained(TAPAS_CHECKPOINT),
	    )


	@st.cache_resource
	def _get_weaviate_client():
	    """Create the client for the embedded Weaviate instance once per server process."""
	    return weaviate.Client(embedded_options=EmbeddedOptions())


	# Initialize TAPAS model and tokenizer
	tokenizer, model = _load_tapas()

	# Initialize Weaviate client for the embedded instance
	client = _get_weaviate_client()

	def _weaviate_data_type(dtype):
	    """Map a pandas column dtype to the name of a Weaviate property data type."""
	    if pd.api.types.is_bool_dtype(dtype):
	        return "boolean"
	    if pd.api.types.is_integer_dtype(dtype):
	        return "int"
	    if pd.api.types.is_float_dtype(dtype):
	        # Weaviate calls floating-point properties "number"; "float" is rejected.
	        return "number"
	    return "text"


	def ingest_data_to_weaviate(dataframe, class_name, class_description):
	    """Create a Weaviate class for ``dataframe`` and import every row as an object.

	    One property is declared per column, named and described after the column
	    and typed from the column's dtype. The schema is created through the
	    module-level ``client`` and the rows are then sent as a batch.

	    Args:
	        dataframe: pandas DataFrame whose columns become class properties and
	            whose rows become objects.
	        class_name: Name of the Weaviate class to create.
	        class_description: Free-text description stored on the class.

	    Raises:
	        Whatever the Weaviate client raises when the schema or the batch is
	        rejected (for example when the class already exists).
	    """
	    properties = [
	        {
	            "name": column,
	            "description": column,
	            "dataType": [_weaviate_data_type(dataframe[column].dtype)],
	        }
	        for column in dataframe.columns
	    ]

	    schema = {
	        "classes": [
	            {
	                "class": class_name,
	                "description": class_description,
	                "properties": properties,
	            }
	        ]
	    }

	    # Create Schema in Weaviate
	    client.schema.create(schema)

	    # Ingest Data. ``to_dict(orient="records")`` keeps each column's own type,
	    # whereas ``iterrows()`` upcasts ints to floats in mixed numeric rows and
	    # would contradict the "int" properties declared above.
	    with client.batch as batch:  # pending objects are flushed when the block exits
	        for record in dataframe.to_dict(orient="records"):
	            # Missing cells (NaN/NA) are left out instead of being sent as values.
	            data_object = {
	                key: value for key, value in record.items() if not pd.isna(value)
	            }
	            batch.add_data_object(data_object=data_object, class_name=class_name)

	def query_weaviate(question):
	    """Run a nearText search for ``question`` against the current Weaviate class.

	    The class is taken from the module-level ``class_name`` (the value typed
	    into the class-name text input), and every property declared in that
	    class's schema is requested.

	    Args:
	        question: Natural-language text used as the single nearText concept.

	    Returns:
	        The raw response returned by the Weaviate query's ``do()`` call.
	    """
	    # This is a basic example; adapt the query based on the question
	    class_schema = client.schema.get(class_name)
	    property_names = [prop["name"] for prop in class_schema.get("properties", [])]
	    return (
	        client.query.get(class_name, property_names)
	        # nearText takes a dict of concepts, not a bare string.
	        .with_near_text({"concepts": [question]})
	        .do()
	    )

	def ask_llm_chunk(chunk, questions):
	    """Answer ``questions`` against one table chunk with the TAPAS model.

	    Args:
	        chunk: pandas DataFrame holding the rows to query.
	        questions: List of question strings.

	    Returns:
	        A list with exactly one string per question, in the same order. Each
	        entry is the predicted cell values joined with ", " (empty when no
	        cell was selected or none could be read), or an error message
	        repeated for every question when the chunk could not be processed.
	    """
	    if not questions:
	        return []

	    # Cells are converted to text before tokenizing. The index is reset
	    # because the TAPAS tokenizer appears to use index labels as row
	    # positions (TODO confirm), which would break for chunks cut from the
	    # middle of a larger frame.
	    chunk = chunk.astype(str).reset_index(drop=True)
	    try:
	        inputs = tokenizer(
	            table=chunk,
	            queries=questions,
	            padding="max_length",
	            truncation=True,
	            return_tensors="pt",
	        )
	    except Exception as e:
	        st.write(f"An error occurred: {e}")
	        return ["Error occurred while tokenizing"] * len(questions)

	    # Defensive guard: never hand the model a sequence longer than 512 tokens.
	    if inputs["input_ids"].shape[1] > 512:
	        st.warning("Token limit exceeded for chunk")
	        return ["Token limit exceeded for chunk"] * len(questions)

	    outputs = model(**inputs)
	    # NOTE: the predicted aggregation indices are not applied to the answer;
	    # only the selected cells are reported.
	    predicted_answer_coordinates, _aggregation_indices = tokenizer.convert_logits_to_predictions(
	        inputs,
	        outputs.logits.detach(),
	        outputs.logits_aggregation.detach(),
	    )

	    answers = []
	    for coordinates in predicted_answer_coordinates:
	        cell_values = []
	        for row, col in coordinates:
	            try:
	                cell_values.append(chunk.iloc[row, col])
	            except IndexError as e:
	                st.write(f"An error occurred: {e}")
	        # Always append, even when every lookup failed, so answers stay
	        # aligned one-to-one with the questions.
	        answers.append(", ".join(map(str, cell_values)))

	    return answers

	# Upper bound on the number of table rows handed to the model in one call.
	MAX_ROWS_PER_CHUNK = 200


	def summarize_map_reduce(data, questions):
	    """Answer ``questions`` over CSV text by querying the table chunk by chunk.

	    Map step: the table is cut into consecutive slices of at most
	    ``MAX_ROWS_PER_CHUNK`` rows and every slice is passed to
	    ``ask_llm_chunk`` with all questions.
	    Reduce step: for each question, the non-empty answers from the slices are
	    joined with "; " so the result lines up one-to-one with ``questions``.

	    Args:
	        data: CSV content as a string.
	        questions: List of question strings.

	    Returns:
	        A list with one combined answer string per question.
	    """
	    dataframe = pd.read_csv(StringIO(data))

	    # Consecutive slices; an empty table still yields one (empty) chunk so the
	    # per-chunk error reporting runs once.
	    dataframe_chunks = [
	        dataframe.iloc[start:start + MAX_ROWS_PER_CHUNK]
	        for start in range(0, len(dataframe), MAX_ROWS_PER_CHUNK)
	    ] or [dataframe]

	    # Map: one answer list per chunk.
	    answers_per_chunk = [ask_llm_chunk(chunk, questions) for chunk in dataframe_chunks]

	    # Reduce: merge the chunk answers belonging to the same question.
	    all_answers = []
	    for index in range(len(questions)):
	        parts = [
	            str(chunk_answers[index])
	            for chunk_answers in answers_per_chunk
	            if index < len(chunk_answers) and chunk_answers[index]
	        ]
	        all_answers.append("; ".join(parts))
	    return all_answers

	st.title("TAPAS Table Question Answering with Weaviate Integration")

	# UI Input for Class and Description
	class_name = st.text_input("Enter the class name for your CSV data:")
	class_description = st.text_input("Enter a description for your class:")

	# Stays None until a file is uploaded, so the submit handler below can test it
	# without hitting an unbound name.
	data = None

	# Upload CSV data
	csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
	if csv_file is not None:
	    # getvalue() returns the whole upload regardless of the current read
	    # position; "utf-8-sig" also accepts files that start with a BOM.
	    data = csv_file.getvalue().decode("utf-8-sig")
	    dataframe = pd.read_csv(StringIO(data))
	    st.write("CSV Data Preview:")
	    st.write(dataframe.head())

	    # Ingest data to Weaviate
	    if st.button("Ingest to Weaviate"):
	        if not class_name.strip():
	            st.warning("Please enter a class name before ingesting.")
	        else:
	            ingest_data_to_weaviate(dataframe, class_name, class_description)
	            st.write("Data ingested successfully!")

	# Input for questions. The text area and the submit button live in a form so
	# that Ctrl+Enter inside the text area submits it. The previous approach
	# injected a <script> through st.markdown, which Streamlit does not execute.
	with st.form("questions_form"):
	    questions_text = st.text_area("Enter your questions (one per line)")
	    submitted = st.form_submit_button("Submit")

	# One question per line; blank and whitespace-only lines are dropped.
	questions = [line.strip() for line in questions_text.splitlines() if line.strip()]

	if submitted:
	    if not data:
	        st.warning("Please upload a CSV file first.")
	    elif not questions:
	        st.warning("Please enter at least one question.")
	    else:
	        answers = summarize_map_reduce(data, questions)
	        st.write("Answers:")
	        for q, a in zip(questions, answers):
	            st.write(f"Question: {q}")
	            st.write(f"Answer: {a}")