import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
import google.generativeai as genai
import os
from io import BytesIO, TextIOWrapper
import PyPDF2
import docx2txt
import csv
from huggingface_hub import InferenceClient
st.title('AI Playground')
st.text('Web Scraping with Pandas and Streamlit, Gemini, Mistral, and Phi-3')

Model = st.selectbox("Select your preferred model:", ["GEMINI", "MISTRAL8X", "PHI-3", "Custom Models"])
if Model == "GEMINI":
    tkey = st.text_input("Your Token or API key here:", "")

    # Button to trigger scraping (kept commented out; scraping now runs on Generate)
    # if st.button('Scrape Data'):
    #     if url:
    #         if 'https://' not in url:
    #             url = 'https://' + url
    #         scraped_data = scrape_data(url)
    #         paragraph = ' '.join(scraped_data['Text'].dropna())
    #         st.write(scraped_data)
    #         st.write(paragraph)
    #     else:
    #         st.write('Please enter a valid website URL')
    # Set up the model
    generation_config = {
        "temperature": 0.9,
        "top_p": 1,
        "top_k": 1,
        "max_output_tokens": 2048,
    }

    safety_settings = [
        {
            "category": "HARM_CATEGORY_HARASSMENT",
            "threshold": "BLOCK_MEDIUM_AND_ABOVE",
        },
        {
            "category": "HARM_CATEGORY_HATE_SPEECH",
            "threshold": "BLOCK_MEDIUM_AND_ABOVE",
        },
        {
            "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "threshold": "BLOCK_MEDIUM_AND_ABOVE",
        },
        {
            "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
            "threshold": "BLOCK_MEDIUM_AND_ABOVE",
        },
    ]

    model = genai.GenerativeModel(model_name="gemini-pro",
                                  generation_config=generation_config,
                                  safety_settings=safety_settings)
    # The key only has to be set before generate_content is actually called
    genai.configure(api_key=tkey)

    def gai(inp):
        # Return the plain-text completion for a prompt
        return model.generate_content(inp).text
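    # Illustrative usage (made-up prompt; requires a valid API key):
    #   gai("Summarize the plot of Hamlet in two sentences.")
    #   -> returns the generated text as a plain string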
################################################################################################################
else:
    tkey = st.text_input("HuggingFace token here:", "")
    if Model == "MISTRAL8X":
        mkey = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    elif Model == "PHI-3":
        mkey = "microsoft/Phi-3-mini-4k-instruct"
    else:
        mkey = st.text_input("Your HuggingFace Model String here:", "")
    def format_prompt(message, history):
        # Wrap each prior turn, then the new message, in [INST] ... [/INST] tags
        prompt = ""
        for user_prompt, bot_response in history:
            prompt += f"[INST] {user_prompt} [/INST]"
            prompt += f" {bot_response} "
        prompt += f"[INST] {message} [/INST]"
        return prompt
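    # Illustrative output, assuming one prior exchange in history (values made up):
    #   format_prompt("How are you?", [("Hi", "Hello!")])
    #   -> "[INST] Hi [/INST] Hello! [INST] How are you? [/INST]"
    # This is the [INST] tag template the Mixtral instruct models expect.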
    def generate(prompt, history=[], temperature=0.9, max_new_tokens=1024, top_p=0.95, repetition_penalty=1.0):
        temperature = float(temperature)
        if temperature < 1e-2:
            temperature = 1e-2
        top_p = float(top_p)

        generate_kwargs = dict(
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=True,
            seed=42,
        )

        formatted_prompt = format_prompt(prompt, history)
        client = InferenceClient(model=mkey, token=tkey)
        stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)

        # Accumulate streamed tokens, stripping BOS/EOS markers as they appear;
        # each yield is the full text generated so far
        output = ""
        for response in stream:
            output += response.token.text
            output = output.replace("<s>", "").replace("</s>", "")
            yield output
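    # Illustrative consumption of the stream (made-up prompt): each yielded
    # value is a progressively longer snapshot of the completion.
    #   for snapshot in generate("Write a haiku about rivers"):
    #       print(snapshot)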
    # history = []
    # while True:
    #     user_input = input("You: ")
    #     if user_input.lower() == "off":
    #         break
    #     history.append((user_input, ""))
    #     for response in generate(user_input, history):
    #         print("Bot:", response)
    def gai(query):
        # generate() yields cumulative snapshots, so keep only the latest one;
        # concatenating them would duplicate the output
        x = ''
        for response in generate(query):
            x = response
        return x
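    # Illustrative usage (made-up query; requires a valid HF token):
    #   gai("Explain recursion briefly")
    #   -> returns the final, fully accumulated completion string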
################################################################################################################
# bg image
page_bg_img = """
<style>
[data-testid="stAppViewContainer"] {
    background-image: url(https://cdn.wallpapersafari.com/41/41/vIdSZT.jpg);
    background-size: cover;
}
</style>
"""
st.markdown(page_bg_img, unsafe_allow_html=True)

inp = st.text_input("Enter a prompt and let AI craft stories, poems, code, and more.", "")
# Function to scrape data
def scrape_data(url):
    # Send HTTP request and parse content
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Use BeautifulSoup to find and extract various types of content
    texts = [element.text for element in soup.find_all(['p', 'a', 'img'])]
    links = [element.get('href') for element in soup.find_all('a') if element.get('href')]
    images = [element.get('src') for element in soup.find_all('img') if element.get('src')]

    # Pad the shorter lists with None so all columns are the same length
    max_length = max(len(texts), len(links), len(images))
    texts += [None] * (max_length - len(texts))
    links += [None] * (max_length - len(links))
    images += [None] * (max_length - len(images))

    # Build a DataFrame with one column per content type
    data = {'Text': texts, 'Links': links, 'Images': images}
    df = pd.DataFrame(data)
    return df
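# Illustrative usage (example.com is a placeholder URL):
#   df = scrape_data('https://example.com')
#   df.columns  -> Index(['Text', 'Links', 'Images'])
# Shorter columns are padded with None so the DataFrame stays rectangular.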
# Function to extract text from a PDF file
def extract_text_from_pdf(file_bytes):
    pdf_reader = PyPDF2.PdfReader(BytesIO(file_bytes))
    text = ""
    for page in pdf_reader.pages:
        # extract_text can come back empty for image-only pages
        text += page.extract_text() or ""
    return text.replace('\t', ' ').replace('\n', ' ')
# Function to extract text from a TXT file
def extract_text_from_txt(file_bytes):
    return file_bytes.decode('utf-8')

# Function to extract text from a DOCX file
def extract_text_from_docx(file_bytes):
    docx = docx2txt.process(BytesIO(file_bytes))
    return docx.replace('\t', ' ').replace('\n', ' ')
# Function to extract text from a CSV file
def extract_text_from_csv(file_bytes, encoding='utf-8'):
    # Wrap the raw bytes in a text stream and read it with the csv module
    # (no need to decode and re-encode first)
    csv_reader = csv.reader(TextIOWrapper(BytesIO(file_bytes), encoding=encoding))
    # Concatenate all rows and columns into a single text
    text = ""
    for row in csv_reader:
        text += ' '.join(row) + ' '
    return text.replace('\t', ' ').replace('\n', ' ')
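# Illustrative usage of the extractors (hypothetical local file):
#   with open('report.pdf', 'rb') as f:
#       text = extract_text_from_pdf(f.read())
# Each extractor takes raw bytes and returns whitespace-normalized text.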
url_input = st.checkbox("Use website input")
url = ""
if url_input:
    # Input for the website URL
    url = st.text_input('Enter the website URL (optional): ', '')

file_input = st.checkbox("Use file input")
uploaded_file = None

sp_prompt = ""
prompt_input = st.checkbox("Use special prompt input")
if prompt_input:
    sp_prompt = st.selectbox("Special Prompt (Optional):", [
        "Prompt A: Explain the following with proper details.",
        "Prompt B: Describe the whole thing in a nutshell.",
        "Prompt C: How can this be useful for us?"
    ])
if file_input:
    # Add file uploader
    st.write("Upload a PDF, TXT, DOCX, or CSV file to extract the text.")
    uploaded_file = st.file_uploader("Choose a file")
    if uploaded_file:
        # Get the file extension and extract text accordingly
        file_name, file_extension = os.path.splitext(uploaded_file.name)
        file_extension = file_extension.lower()
        if file_extension == ".pdf":
            uploaded_file = extract_text_from_pdf(uploaded_file.getvalue())
        elif file_extension == ".txt":
            uploaded_file = extract_text_from_txt(uploaded_file.getvalue())
        elif file_extension == ".docx":
            uploaded_file = extract_text_from_docx(uploaded_file.getvalue())
        elif file_extension == ".csv":
            uploaded_file = extract_text_from_csv(uploaded_file.getvalue())
        else:
            st.error("Unsupported file type.")
output = ''
previous_responses = []

if st.button("Generate"):
    if tkey == '':
        st.error("Need to input Token or API key.")
        st.stop()

    if url:
        if 'https://' not in url:
            url = 'https://' + url
        scraped_data = scrape_data(url)
        paragraph = ' '.join(scraped_data['Text'].dropna())
        # st.write(scraped_data)
        # st.write(paragraph)
        inp = paragraph + ' ' + "Take the given data above as information and generate a response based on this prompt: " + inp
    if sp_prompt:
        inp = inp + " " + sp_prompt
    if uploaded_file:
        inp = inp + " " + uploaded_file
    if inp:
        # st.write(inp)
        output = gai(inp)
        st.write(output)

        # # Add response to the list of previous_responses
        # previous_responses.append(output)
        # # Display all previous responses
        # st.subheader("Previous Responses:")
        # for i, response in enumerate(previous_responses, start=1):
        #     st.write(f"{i}. {response}")
        # Add download button
        if output:
            # filename = 'Generated_Answer.txt'
            # with open(filename, 'w') as f:
            #     f.write(output)

            # Output file type (select box left commented out for reference)
            ofType = 'txt'
            # ofType = st.selectbox("Choose an output file type: ", ["TXT", "PY", "HTML"])
            st.download_button("Download File", data=output, file_name=f"Generated_Answer.{ofType}")
    else:
        st.error("Please enter a prompt to generate text.")
# st.subheader("[...Visit my GitHub Profile...](https://github.com/NafisRayan)")

# Run locally with: streamlit run app.py