# Source: Hugging Face Space page (reported status: Runtime error)
| import streamlit as st | |
| import pdfplumber | |
| import torch | |
| from transformers import RobertaTokenizer, RobertaModel | |
| import nltk | |
| import openai | |
| from torch import nn | |
| import torch.nn.functional as F | |
| from nltk.tokenize import sent_tokenize | |
| import os | |
# Debug aid: print the contents of the working directory (presumably to
# confirm in the logs that ``model.pt`` was deployed next to the script).
print(os.listdir('.'))

# ``sent_tokenize`` needs the Punkt tokenizer data.  Streamlit re-executes
# this script on every interaction, so only reach out to the network when
# the data is not already installed locally.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
class Bert_model(nn.Module):
    """RoBERTa encoder followed by a small two-class classification head.

    The head is ``Linear(hidden_size, hidden_size) -> Dropout ->
    Linear(hidden_size, 2)`` applied to the encoder output at sequence
    position 0.

    The attribute names (``bert``, ``cls_prj``, ``cls_dropout``,
    ``cls_final``) double as ``state_dict`` keys, so they must not be
    renamed if existing checkpoints are to keep loading.

    Args:
        hidden_size: Width of the encoder output and of the projection layer.
        dropout_rate: Dropout probability applied after the projection.
        pretrained_name: Checkpoint passed to ``RobertaModel.from_pretrained``.
            Defaults to the checkpoint that was previously hard-coded.

    Raises:
        ValueError: If ``hidden_size`` differs from the encoder's hidden
            size, which would otherwise surface later as a shape error
            inside ``forward``.
    """

    def __init__(self, hidden_size, dropout_rate,
                 pretrained_name='deepset/roberta-base-squad2'):
        super().__init__()
        self.hidden_size = hidden_size
        self.bert = RobertaModel.from_pretrained(pretrained_name)

        encoder_width = self.bert.config.hidden_size
        if encoder_width != hidden_size:
            raise ValueError(
                f"hidden_size={hidden_size} does not match the encoder "
                f"hidden size {encoder_width} of '{pretrained_name}'"
            )

        self.cls_prj = nn.Linear(hidden_size, hidden_size, bias=True)
        self.cls_dropout = nn.Dropout(dropout_rate)
        self.cls_final = nn.Linear(hidden_size, 2, bias=True)

    def forward(self, input_ids, attention_mask):
        """Return the two-class logits for each sequence in the batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            Tensor whose last dimension has size 2 (one logit per class).
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the pooled
        # representation of the whole input.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        projected = self.cls_dropout(self.cls_prj(first_token_state))
        return self.cls_final(projected)
model_path = "model.pt"

# Use the first GPU when one is available and fall back to the CPU
# otherwise; a hard-coded "cuda:0" crashes on CPU-only hosts.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# ``map_location`` remaps tensors that were saved from a CUDA device so the
# checkpoint can also be deserialized on a machine without a GPU.
state_dict = torch.load(model_path, map_location=device)

model = Bert_model(hidden_size=768, dropout_rate=0.1)
# The model is wrapped in DataParallel *before* loading the weights, so the
# checkpoint keys are matched against the wrapped module's key names
# (presumably the checkpoint was saved from a DataParallel model -- verify).
model = nn.DataParallel(model)
model.load_state_dict(state_dict)
model = model.to(device)
model.eval()  # inference only: disables dropout

tokenizer = RobertaTokenizer.from_pretrained('deepset/roberta-base-squad2')
def preprocess_pdf(pdf_path, tokenizer, skip_pages=2):
    """Extract the text of a PDF and tokenize it into fixed-length tensors.

    Args:
        pdf_path: Path (or anything ``pdfplumber.open`` accepts) of the PDF.
        tokenizer: Tokenizer exposing ``encode_plus``.
        skip_pages: Number of leading pages to ignore.  Defaults to 2,
            the value that was previously hard-coded.

    Returns:
        A tuple ``(input_ids, attention_mask, text)`` where the two tensors
        have a leading batch dimension of 1 and are padded/truncated to 512
        tokens, and ``text`` is the extracted page text joined by spaces.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # ``extract_text()`` may return None for a page without extractable
        # text; substitute an empty string so the join below cannot fail.
        page_texts = [page.extract_text() or "" for page in pdf.pages[skip_pages:]]
    text = " ".join(page_texts)

    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        # Without explicit truncation, ``max_length`` only pads and long
        # documents produce sequences longer than 512 tokens.
        truncation=True,
        return_attention_mask=True,
    )
    input_ids = torch.tensor([encoded['input_ids']])
    attention_mask = torch.tensor([encoded['attention_mask']])
    return input_ids, attention_mask, text
def translate_text(text, target_language):
    """Translate ``text`` into ``target_language`` via the OpenAI chat API.

    Sends one system message and one user message to the
    ``gpt-4-1106-preview`` model through ``openai.ChatCompletion.create``
    and returns the content of the first returned choice.

    Args:
        text: Text to translate (the prompt describes it as English).
        target_language: Name of the language to translate into.

    Returns:
        The message content of the first completion choice.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that translates English text to other languages.",
    }
    user_message = {
        "role": "user",
        "content": f'Translate the following English text to {target_language}: "{text}"',
    }
    completion = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message['content']
def explain_term(term):
    """Ask the OpenAI chat API for an explanation of ``term``.

    Sends one system message and one user message to the
    ``gpt-4-1106-preview`` model through ``openai.ChatCompletion.create``
    and returns the content of the first returned choice.

    Args:
        term: The term to be explained.

    Returns:
        The message content of the first completion choice.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that provides definitions."},
        {"role": "user", "content": f"Explain the term: {term}"},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=conversation,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']
st.title('FinChat')

# The OpenAI key is read from Streamlit's secrets store (entry "api_key").
api_key = st.secrets["api_key"]
openai.api_key = api_key

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
language = st.selectbox(
    'Select your language',
    ['English', 'French', 'Chinese', 'Korean', 'Spanish', 'German', 'Japanese'],
)

# Number of highest-scoring sentences handed to the chat model as context.
TOP_K_SENTENCES = 13
# Number of (question, sentence) pairs scored per forward pass.
SCORING_BATCH_SIZE = 16


def _score_sentences(question, sentences, batch_size=SCORING_BATCH_SIZE):
    """Return, for each sentence, the classifier's probability of class 1.

    Each sentence is paired with ``question`` and scored by the module-level
    ``model``.  Class index 1 is assumed to be the "relevant" label, since
    the ranking places it first -- verify against the training setup.
    Pairs are scored in batches and padded only to the longest item in the
    batch rather than to 512 tokens each.
    """
    scores = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(
            [question] * len(batch),
            batch,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512,
        )
        batch_ids = inputs['input_ids'].to(device)
        batch_mask = inputs['attention_mask'].to(device)
        with torch.no_grad():
            logits = model(batch_ids, batch_mask)
        scores.extend(F.softmax(logits, dim=1)[:, 1].tolist())
    return scores


if uploaded_file is not None:
    # Persist the upload so it can be opened by path.
    with open("temp.pdf", "wb") as f:
        f.write(uploaded_file.getbuffer())
    # Only the extracted text is needed here; the whole-document tensors
    # returned alongside it are not used by this script.
    _, _, text = preprocess_pdf("temp.pdf", tokenizer)
    st.write('File successfully uploaded and processed')

    question = st.text_input("Enter your question:")
    if question:
        sentences = sent_tokenize(text)
        if not sentences:
            # Nothing to rank; avoid sending an empty context to the API.
            st.warning("No text could be extracted from the uploaded PDF.")
        else:
            scores = _score_sentences(question, sentences)
            # Rank by the probability of class 1 rather than by the 0/1
            # argmax label, so the most confident sentences come first.
            ranked = sorted(
                zip(sentences, scores),
                key=lambda pair: pair[1],
                reverse=True,
            )
            top_sentences = ranked[:TOP_K_SENTENCES]
            chat_history = "\n".join(sentence for sentence, _ in top_sentences)

            response = openai.ChatCompletion.create(
                model="gpt-4-1106-preview",
                messages=[
                    {"role": "system", "content": "You are a helpful generator which read the short paragraphs and answer the question."},
                    {"role": "user", "content": chat_history},
                    {"role": "user", "content": question},
                ],
            )
            answer = response.choices[0].message['content']
            if language != 'English':
                response_content = translate_text(answer, language)
            else:
                response_content = answer
            st.text("Answer: " + response_content)

    # NOTE(review): the original indentation was lost, so it is unclear
    # whether the term-definition section belongs inside this upload branch
    # or at top level -- confirm the intended nesting.
    term = st.text_input("Enter a term you want to define:")
    if term:
        definition = explain_term(term)
        if language != 'English':
            definition = translate_text(definition, language)
        st.text("Definition: " + definition)