Spaces:

rianders
/

mpi_data_store

Sleeping

App Files Files Community

rianders committed on Apr 12, 2024

Commit

af4dc9a

1 Parent(s): 2ac76dd

Removed extra pages

Browse files

Files changed (6) hide show

pages/data_loading.py +0 -39
pages/data_source_config.py +0 -41
pages/fetch_resources_store_collection.py +0 -59
pages/file_web_source_collection.py +0 -87
pages/model_selection.py +0 -52
pages/processing_embedding.py +0 -53

pages/data_loading.py DELETED Viewed

@@ -1,39 +0,0 @@
-import streamlit as st
-def main():
-    """Render the "Data Loading" page.
-
-    Shows two multiselect widgets (directories first, then file types). Each
-    non-empty selection is written to ``st.session_state`` and echoed with a
-    success message; an empty selection produces an error message instead.
-    A final button sets ``st.session_state.page`` to ``'model_selection'``.
-    """
-    st.title("Data Loading")
-    st.write("Select directories and file types to process from the configured data source.")
-    # One row per picker: widget label, placeholder options, session-state key,
-    # success-message prefix, and the error shown when nothing is selected.
-    # The option lists are hard-coded examples, not read from a repository.
-    pickers = (
-        (
-            "Select Directories",
-            ["src", "docs", "examples", "tests"],
-            'selected_directories',
-            "Selected directories: ",
-            "Please select at least one directory.",
-        ),
-        (
-            "Select File Types to Include",
-            ["pdf", "txt", "md"],
-            'selected_file_types',
-            "Selected file types: ",
-            "Please select at least one file type.",
-        ),
-    )
-    for label, choices, state_key, ok_prefix, empty_message in pickers:
-        # Every option is pre-selected by default.
-        chosen = st.multiselect(label, options=choices, default=choices)
-        if not chosen:
-            st.error(empty_message)
-            continue
-        # Keep the selection for later pages.
-        st.session_state[state_key] = chosen
-        st.success(ok_prefix + ', '.join(chosen))
-    # Navigation relies on the app reading ``st.session_state.page``.
-    proceed = st.button("Proceed to Model Selection and Configuration")
-    if proceed:
-        st.session_state.page = 'model_selection'
-if __name__ == "__main__":
-    main()

pages/data_source_config.py DELETED Viewed

@@ -1,41 +0,0 @@
-import streamlit as st
-import os
-def main():
-    """Render the "Data Source Configuration" page.
-
-    Collects a GitHub repository URL and an output directory. The URL is
-    stored under ``st.session_state['repo_url']`` whenever it contains the
-    substring ``github.com``. The output directory is created (if needed) and
-    stored under ``st.session_state['output_dir']`` when the save button is
-    pressed. A final button sets ``st.session_state.page`` to
-    ``'data_loading'``.
-    """
-    st.title("Data Source Configuration")
-    st.write("Configure the source from which to mine data, including the GitHub repository and the output directory for storing generated data.")
-    repo_url = st.text_input("GitHub Repository URL", "https://github.com/username/repository")
-    # Only a substring check: this is demonstration-level validation.
-    looks_like_github = "github.com" in repo_url
-    if looks_like_github:
-        st.session_state['repo_url'] = repo_url
-        st.success("Repository URL saved.")
-    else:
-        st.error("Please enter a valid GitHub repository URL.")
-    # The text box is pre-filled with ./output_data.
-    out_dir = st.text_input(
-        "Output Directory for Generated Data",
-        value=os.path.join(".", "output_data"),
-    )
-    save_requested = st.button("Save Output Directory")
-    if save_requested:
-        try:
-            # exist_ok=True makes saving an already-existing directory a no-op.
-            os.makedirs(out_dir, exist_ok=True)
-            st.session_state['output_dir'] = out_dir
-            st.success(f"Output directory set to: {out_dir}")
-        except Exception as exc:
-            # Any failure is reported on the page instead of being raised.
-            st.error(f"Failed to create the directory: {exc}")
-    # Navigation relies on the app reading ``st.session_state.page``.
-    go_next = st.button("Proceed to Data Loading")
-    if go_next:
-        st.session_state.page = 'data_loading'
-if __name__ == "__main__":
-    main()

pages/fetch_resources_store_collection.py DELETED Viewed

@@ -1,59 +0,0 @@
-import streamlit as st
-from langchain_community.document_loaders import AsyncHtmlLoader
-from langchain.schema import Document
-import json
-from typing import Iterable
-# Placeholder for async fetch function (adjust based on actual async handling in your environment)
-async def fetch_documents(urls):
-    """Fetch the pages at ``urls`` and return them as a list of documents.
-
-    ``AsyncHtmlLoader.load()`` is a regular blocking method, not a coroutine,
-    so it cannot be awaited directly. It is run in the default executor so
-    this coroutine stays awaitable without blocking the event loop.
-    """
-    import asyncio  # local import: the module's import block is left untouched
-    loader = AsyncHtmlLoader(urls)
-    loop = asyncio.get_running_loop()
-    return await loop.run_in_executor(None, loader.load)
-def save_docs_to_jsonl(array: Iterable[Document], file_path: str) -> None:
-    """Write each document to ``file_path`` as one JSON object per line.
-
-    Only ``page_content`` and ``metadata`` are serialized, which are exactly
-    the keyword arguments ``load_docs_from_jsonl`` passes back to ``Document``.
-    """
-    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
-        for doc in array:
-            record = {"page_content": doc.page_content, "metadata": doc.metadata}
-            jsonl_file.write(json.dumps(record) + '\n')
-def load_docs_from_jsonl(file_path) -> Iterable[Document]:
-    """Read a JSON-lines file and return the documents it contains as a list."""
-    array = []
-    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
-        for line in jsonl_file:
-            # Tolerate blank lines (e.g. a trailing newline added by an editor).
-            if not line.strip():
-                continue
-            array.append(Document(**json.loads(line)))
-    return array
-def fetch_clean_organize_page():
-    """Render the "Fetch, Clean, and Organize Documents" page.
-
-    Reads the URL selection from ``st.session_state['selected_urls']``,
-    fetches the pages on request, stores the result under
-    ``st.session_state['docs']`` and offers to save/download it as JSON lines.
-    """
-    import asyncio  # local import: the module's import block is left untouched
-    st.title("Fetch, Clean, and Organize Documents")
-    # Default to an empty list so the page works before any URL was selected.
-    if 'selected_urls' not in st.session_state:
-        st.session_state['selected_urls'] = []
-    urls = st.session_state['selected_urls']
-    # The URL-collection page may have stored a DataFrame with a
-    # 'Selected URLs' column; reduce it to the plain list the loader expects.
-    if 'Selected URLs' in getattr(urls, 'columns', ()):
-        urls = urls['Selected URLs'].tolist()
-    urls = list(urls)
-    if st.button("Fetch Documents"):
-        if urls:
-            # Actually run the coroutine; storing the un-awaited coroutine
-            # object would make the len() call below fail.
-            st.session_state['docs'] = asyncio.run(fetch_documents(urls))
-        else:
-            st.warning("No URLs selected; nothing to fetch.")
-    if 'docs' in st.session_state:
-        st.write(f"Fetched {len(st.session_state['docs'])} documents.")
-        if st.button("Save Documents as JSON"):
-            save_docs_to_jsonl(st.session_state['docs'], "documents.jsonl")
-            st.success("Documents saved as JSON.")
-            # Provide download link (streamlit >= 0.88.0)
-            with open("documents.jsonl", "rb") as file:
-                st.download_button(
-                        label="Download JSON",
-                        data=file,
-                        file_name="documents.jsonl",
-                        mime="application/octet-stream")
-# Streamlit executes page modules top to bottom, so the page is rendered here.
-fetch_clean_organize_page()

pages/file_web_source_collection.py DELETED Viewed

@@ -1,87 +0,0 @@
-import streamlit as st
-import pandas as pd
-import requests
-from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urlparse
-def find_linked_urls(url, timeout=10):
-    """Return the set of ``href`` values of all ``<a>`` tags found at ``url``.
-
-    Args:
-        url: Address of the page to download.
-        timeout: Seconds to wait for the server before giving up. Without a
-            timeout a single unresponsive host would hang the whole page.
-
-    Returns:
-        A set of raw ``href`` strings (possibly relative). An empty set is
-        returned when the response status is not 200 or when any error
-        occurs; both cases are reported on the page via ``st.write``.
-    """
-    try:
-        response = requests.get(url, timeout=timeout)
-        if response.status_code != 200:
-            st.write(f"Failed to retrieve {url}")
-            return set()
-        soup = BeautifulSoup(response.text, 'html.parser')
-        # href=True keeps only anchors that actually carry an href attribute.
-        return {anchor['href'] for anchor in soup.find_all('a', href=True)}
-    except Exception as e:
-        # Best effort by design: one bad URL must not abort the whole scan.
-        st.write(f"An error occurred with {url}: {e}")
-    return set()
-def convert_to_absolute_urls(base_url, links):
-    """Resolve every link against ``base_url`` and return the results as a set.
-
-    ``urljoin`` already returns a link that has its own scheme and host
-    unchanged, so no prefix check is needed. The previous
-    ``startswith('http')`` shortcut wrongly skipped relative links whose path
-    merely begins with "http" (for example ``http-api.html``).
-    """
-    return {urljoin(base_url, link) for link in links}
-def categorize_links(base_url, links):
-    """Split ``links`` into ``(internal, external)`` sets.
-
-    A link is internal when its network location (``netloc``) equals that of
-    ``base_url``; every other link is external.
-    """
-    home = urlparse(base_url).netloc
-    candidates = set(links)
-    same_site = {candidate for candidate in candidates if urlparse(candidate).netloc == home}
-    return same_site, candidates - same_site
-def main():
-    """Render the URL-scanning page.
-
-    Each entered URL is scanned for links, and the results are kept per base
-    URL in ``st.session_state['scanned_urls']``. The user picks internal links
-    (all at once or one by one), and the selection is stored as a plain list
-    under ``st.session_state['selected_urls']`` and offered as a CSV download.
-    """
-    st.title("Data Source Configuration")
-    if 'scanned_urls' not in st.session_state:
-        st.session_state['scanned_urls'] = {}
-    st.subheader("Scan Websites for URLs")
-    url_input = st.text_area("Enter URLs to scan, separated by new lines:")
-    url_list = [line.strip() for line in url_input.splitlines() if line.strip()]
-    scan_button_clicked = st.button("Scan URLs")
-    if scan_button_clicked:
-        for url in url_list:
-            unique_urls = find_linked_urls(url)
-            absolute_urls = convert_to_absolute_urls(url, unique_urls)
-            internal_links, external_links = categorize_links(url, absolute_urls)
-            st.session_state['scanned_urls'][url] = {"internal": internal_links, "external": external_links}
-    # Nothing scanned yet (now or on an earlier run): nothing to display.
-    if not st.session_state['scanned_urls']:
-        return
-    selected_urls = []
-    for base_url, links in st.session_state['scanned_urls'].items():
-        st.write(f"Base URL: {base_url}")
-        # Sets have no stable order; sort so the checkboxes do not jump around.
-        internal_links = sorted(links["internal"])
-        include_all_internal = st.checkbox(f"Include all internal links from {base_url}", key=f"all_{base_url}")
-        if include_all_internal:
-            selected_urls.extend(internal_links)
-        else:
-            # The key includes the base URL because the same link can appear
-            # under several base URLs and widget keys must be unique.
-            selected_urls.extend(
-                link for link in internal_links
-                if st.checkbox(link, key=f"link_{base_url}_{link}")
-            )
-        if links["external"]:
-            st.write("External links:")
-            for link in sorted(links["external"]):
-                st.write(link)
-    # Drop duplicates while keeping first-seen order.
-    selected_urls = list(dict.fromkeys(selected_urls))
-    # Store a plain list: the fetch page initialises this key to a list.
-    # Assigning unconditionally also clears a selection that was removed.
-    st.session_state['selected_urls'] = selected_urls
-    if selected_urls:
-        df_selected_urls = pd.DataFrame(selected_urls, columns=['Selected URLs'])
-        st.write(df_selected_urls)
-        # Convert DataFrame to CSV for download
-        csv = df_selected_urls.to_csv(index=False).encode('utf-8')
-        st.download_button(
-            label="Download selected URLs as CSV",
-            data=csv,
-            file_name='selected_urls.csv',
-            mime='text/csv',
-        )
-if __name__ == "__main__":
-    main()

pages/model_selection.py DELETED Viewed

@@ -1,52 +0,0 @@
-import streamlit as st
-def main():
-    """Render the "Model Selection and Configuration" page.
-
-    Lets the user choose an embedding model and an LLM, shows model-specific
-    settings, and on "Save Model Configuration" writes the choices to
-    ``st.session_state`` under ``'selected_embedding_model'``,
-    ``'selected_llm_model'`` and ``'llm_model_config'``.
-    """
-    st.title("Model Selection and Configuration")
-    # Introduction
-    st.write("Select the embedding model and the large language model (LLM) for processing.")
-    # Embedding Model Selection
-    embedding_models = ["thenlper/gte-small", "sentence-transformers/all-MiniLM-L6-v2", "other"]
-    selected_embedding_model = st.selectbox("Select Embedding Model", options=embedding_models)
-    # LLM Model Selection
-    llm_models = ["mistralai/Mistral-7B-Instruct-v0.2", "gpt-3.5-turbo", "other"]
-    selected_llm_model = st.selectbox("Select LLM Model", options=llm_models)
-    # Display selections (for demonstration)
-    st.write("Selected Embedding Model:", selected_embedding_model)
-    st.write("Selected LLM Model:", selected_llm_model)
-    # Configuration options for the selected models
-    st.header("Model Configuration")
-    # Embedding Model Configuration (example)
-    if selected_embedding_model == "thenlper/gte-small":
-        st.write("No additional configuration required for this model.")
-    else:
-        st.write("Configuration options for other models will appear here.")
-    # LLM Model Configuration (example)
-    # Start from an empty config so saving also works for models that have no
-    # sliders; previously that path raised because the slider variables were
-    # never assigned.
-    llm_model_config = {}
-    if selected_llm_model == "mistralai/Mistral-7B-Instruct-v0.2":
-        llm_model_config["max_tokens"] = st.slider("Max Tokens", min_value=100, max_value=1000, value=250)
-        llm_model_config["temperature"] = st.slider("Temperature", min_value=0.0, max_value=1.0, value=0.7, step=0.01)
-    else:
-        st.write("Configuration options for other models will appear here.")
-    # Save model selections and configurations
-    if st.button("Save Model Configuration"):
-        st.session_state['selected_embedding_model'] = selected_embedding_model
-        st.session_state['selected_llm_model'] = selected_llm_model
-        st.session_state['llm_model_config'] = llm_model_config
-        st.success("Model configurations saved.")
-if __name__ == "__main__":
-    main()

pages/processing_embedding.py DELETED Viewed

@@ -1,53 +0,0 @@
-import streamlit as st
-import time  # Used to simulate processing time
-def main():
-    """Render the "Processing and Embedding" page.
-
-    The processing step is simulated with a five-second sleep; on completion
-    ``st.session_state['processing_complete']`` is set to ``True``. The page
-    also shows a progress bar, a placeholder tuning section, three save
-    checkboxes, and two navigation buttons that set ``st.session_state.page``.
-    """
-    st.title("Processing and Embedding")
-    st.write("Process the selected data using the configured models and save the results.")
-    start_clicked = st.button("Start Processing")
-    if start_clicked:
-        with st.spinner("Processing..."):
-            # Stand-in for the real processing logic.
-            time.sleep(5)
-            st.session_state['processing_complete'] = True
-            st.success("Processing completed successfully!")
-    # The bar is either full (finished) or empty; no intermediate progress.
-    finished = 'processing_complete' in st.session_state and bool(st.session_state['processing_complete'])
-    st.progress(100 if finished else 0)
-    if finished:
-        st.write("Data has been processed and embedded successfully.")
-    st.header("Parameter Tuning")
-    st.write("Adjust processing parameters if necessary. (This section is a placeholder.)")
-    st.header("Save Results")
-    # Each checkbox only echoes a message; no saving logic is wired up.
-    save_options = (
-        ("Save Preprocessed Pages", "Preprocessed pages will be saved."),
-        ("Save Processed Pages", "Processed pages will be saved."),
-        ("Save Vectors Store Data", "Vectors store data will be saved."),
-    )
-    for checkbox_label, confirmation in save_options:
-        if st.checkbox(checkbox_label):
-            st.write(confirmation)
-    # Two side-by-side navigation buttons.
-    back_column, exit_column = st.columns(2)
-    with back_column:
-        if st.button("Back to Model Selection"):
-            st.session_state.page = 'model_selection'
-    with exit_column:
-        if st.button("Complete and Exit"):
-            st.session_state.page = 'main_page'
-if __name__ == "__main__":
-    main()