```python
from copy import deepcopy
from io import StringIO

import numpy as np
import pandas as pd
import streamlit as st
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering

def log_debug_info(message):
    # Minimal stand-in: the original app presumably defines its own logger.
    print(message)

# Sentence-transformers checkpoint fine-tuned for table retrieval.
retriever = SentenceTransformer("deepset/all-mpnet-base-v2-table")

def embed_table(table):
    # Serialize the DataFrame to CSV and embed it as a single string.
    return retriever.encode(table.to_csv(index=False))

def embed_question(question):
    return retriever.encode(question)
```
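`embed_table` and `embed_question` are defined but never called in this snippet; presumably they back a retrieval step that picks the most relevant table for a given question. A minimal sketch of that idea, assuming a hypothetical list of candidate DataFrames called `tables` (not in the original code) and using sentence-transformers' cosine similarity:

```python
from sentence_transformers import util

def retrieve_best_table(tables, question):
    # tables: hypothetical list of candidate pandas DataFrames.
    question_embedding = embed_question(question)
    scores = [util.cos_sim(question_embedding, embed_table(t)).item() for t in tables]
    return tables[int(np.argmax(scores))]
```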
```python
def initialize_tapas():
    tokenizer = AutoTokenizer.from_pretrained("google/tapas-large-finetuned-wtq")
    model = AutoModelForTableQuestionAnswering.from_pretrained("google/tapas-large-finetuned-wtq")
    return tokenizer, model

def ask_llm_chunk(tokenizer, model, chunk, questions):
    # TAPAS expects every cell to be a string.
    chunk = chunk.astype(str)
    try:
        inputs = tokenizer(
            table=chunk,
            queries=questions,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
    except Exception as e:
        log_debug_info(f"Tokenization error: {e}")
        st.write(f"An error occurred: {e}")
        return ["Error occurred while tokenizing"] * len(questions)

    # TAPAS has a hard 512-token sequence limit.
    if inputs["input_ids"].shape[1] > 512:
        log_debug_info("Token limit exceeded for chunk")
        st.warning("Token limit exceeded for chunk")
        return ["Token limit exceeded for chunk"] * len(questions)

    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)

    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
        inputs,
        outputs.logits.detach(),
        outputs.logits_aggregation.detach(),
    )

    answers = []
    for coordinates in predicted_answer_coordinates:
        if len(coordinates) == 1:
            # Single-cell answer: read the value straight from the chunk.
            row, col = coordinates[0]
            try:
                value = chunk.iloc[row, col]
                log_debug_info(f"Accessed value for row {row}, col {col}: {value}")
                answers.append(value)
            except Exception as e:
                log_debug_info(f"Error accessing value for row {row}, col {col}: {e}")
                st.write(f"An error occurred: {e}")
                answers.append("Error accessing cell")  # keep answers aligned with questions
        else:
            # Multi-cell answer: join all referenced cells.
            cell_values = []
            for row, col in coordinates:
                try:
                    cell_values.append(chunk.iloc[row, col])
                except Exception as e:
                    log_debug_info(f"Error accessing value for row {row}, col {col}: {e}")
                    st.write(f"An error occurred: {e}")
            answers.append(", ".join(map(str, cell_values)))
    return answers
```
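Note that `predicted_aggregation_indices` is computed but then discarded, so aggregation questions ("how many...", "what is the total...") only ever return raw cells. If you want to surface those predictions, a small decoder like the following could be used; the index-to-operator mapping matches the standard TAPAS WTQ fine-tuning setup:

```python
# Aggregation operators used by TAPAS fine-tuned on WTQ.
ID2AGGREGATION = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}

def decode_aggregations(predicted_aggregation_indices):
    # Map each predicted index to its operator name.
    return [ID2AGGREGATION[int(i)] for i in predicted_aggregation_indices]
```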
```python
MAX_ROWS_PER_CHUNK = 200

def summarize_map_reduce(tokenizer, model, data, questions):
    dataframe = pd.read_csv(StringIO(data))
    # Ceiling division so the final partial chunk is kept without creating an empty extra chunk.
    num_chunks = max(1, (len(dataframe) + MAX_ROWS_PER_CHUNK - 1) // MAX_ROWS_PER_CHUNK)
    dataframe_chunks = [deepcopy(chunk) for chunk in np.array_split(dataframe, num_chunks)]
    all_answers = []
    for chunk in dataframe_chunks:
        all_answers.extend(ask_llm_chunk(tokenizer, model, chunk, questions))
    return all_answers
```
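For completeness, here is a minimal sketch of how these functions might be wired into the Streamlit front end that the `st.*` calls imply; the uploader widget and single-question flow are assumptions, not part of the original code:

```python
tokenizer, model = initialize_tapas()  # in a real app, wrap this in st.cache_resource

uploaded = st.file_uploader("Upload a CSV file", type="csv")
question = st.text_input("Ask a question about the table")

if uploaded is not None and question:
    csv_text = uploaded.getvalue().decode("utf-8")
    # summarize_map_reduce returns one answer per (chunk, question) pair.
    for i, answer in enumerate(summarize_map_reduce(tokenizer, model, csv_text, [question]), start=1):
        st.write(f"Chunk {i}: {answer}")
```

Since the map step yields one answer per chunk, a real app would still need a reduce step that merges or ranks the per-chunk answers rather than printing them all.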