Spaces:
Sleeping
Sleeping
Removed extra pages
Browse files- pages/data_loading.py +0 -39
- pages/data_source_config.py +0 -41
- pages/fetch_resources_store_collection.py +0 -59
- pages/file_web_source_collection.py +0 -87
- pages/model_selection.py +0 -52
- pages/processing_embedding.py +0 -53
pages/data_loading.py
DELETED
|
@@ -1,39 +0,0 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
|
| 3 |
-
def _select_and_store(label, options, state_key, plural, singular):
    """Render a multiselect and mirror its value into ``st.session_state``.

    Args:
        label: Widget label shown to the user.
        options: Selectable values; all of them are pre-selected.
        state_key: Session-state key that receives the current selection.
        plural: Noun used in the success message (e.g. ``"directories"``).
        singular: Noun used in the error message (e.g. ``"directory"``).

    Returns:
        The list of currently selected values (possibly empty).
    """
    chosen = st.multiselect(label, options=options, default=options)
    # Store the selection unconditionally: writing only when it is non-empty
    # would leave a stale earlier choice behind after the user clears it.
    st.session_state[state_key] = chosen
    if chosen:
        st.success(f"Selected {plural}: {', '.join(chosen)}")
    else:
        st.error(f"Please select at least one {singular}.")
    return chosen


def main():
    """Render the "Data Loading" page.

    Lets the user pick directories and file types, stores both selections in
    ``st.session_state`` (keys ``selected_directories`` and
    ``selected_file_types``) and offers a button that switches
    ``st.session_state.page`` to ``'model_selection'``.
    """
    st.title("Data Loading")
    st.write("Select directories and file types to process from the configured data source.")

    # Placeholder values that only demonstrate the UI; a real implementation
    # would list the directories of the configured repository.
    directories = ["src", "docs", "examples", "tests"]
    selected_directories = _select_and_store(
        "Select Directories",
        directories,
        "selected_directories",
        "directories",
        "directory",
    )

    file_types = ["pdf", "txt", "md"]  # Example file types
    selected_file_types = _select_and_store(
        "Select File Types to Include",
        file_types,
        "selected_file_types",
        "file types",
        "file type",
    )

    if st.button("Proceed to Model Selection and Configuration"):
        if selected_directories and selected_file_types:
            # NOTE(review): relies on session-based navigation being handled
            # by the app entry point, which is not visible from this module.
            st.session_state.page = 'model_selection'
        else:
            # Do not advance while one of the selections above is empty.
            st.warning("Select at least one directory and one file type before proceeding.")


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pages/data_source_config.py
DELETED
|
@@ -1,41 +0,0 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
import os
|
| 3 |
-
|
| 4 |
-
def _is_github_repo_url(url):
    """Return True when *url* is an http(s) github.com URL with an owner/repo path.

    A plain substring test would accept any text that merely contains
    ``github.com`` (for example ``https://evil.example/github.com``), so the
    URL is parsed and its scheme, host and path are checked individually.
    """
    # Local import so the module's top-level import block stays untouched.
    from urllib.parse import urlparse

    parsed = urlparse(url.strip())
    if parsed.scheme not in ("http", "https"):
        return False
    if (parsed.hostname or "").lower() not in ("github.com", "www.github.com"):
        return False
    # A repository URL needs at least "/<owner>/<repo>".
    segments = [part for part in parsed.path.split("/") if part]
    return len(segments) >= 2


def main():
    """Render the "Data Source Configuration" page.

    Collects a GitHub repository URL and an output directory. A valid URL is
    stored under ``st.session_state['repo_url']``; the output directory is
    created on demand and stored under ``st.session_state['output_dir']``.
    A final button switches ``st.session_state.page`` to ``'data_loading'``.
    """
    st.title("Data Source Configuration")
    st.write("Configure the source from which to mine data, including the GitHub repository and the output directory for storing generated data.")

    # Repository selection
    repo_url = st.text_input("GitHub Repository URL", "https://github.com/username/repository")
    if _is_github_repo_url(repo_url):
        st.session_state['repo_url'] = repo_url.strip()
        st.success("Repository URL saved.")
    else:
        st.error("Please enter a valid GitHub repository URL.")

    # Output directory selection
    default_dir = os.path.join(".", "output_data")
    out_dir = st.text_input("Output Directory for Generated Data", value=default_dir)

    if st.button("Save Output Directory"):
        try:
            os.makedirs(out_dir, exist_ok=True)  # no-op when it already exists
        except (OSError, ValueError) as e:
            # OSError covers permission/path problems (including an empty
            # path); ValueError is raised for paths with embedded NUL bytes.
            st.error(f"Failed to create the directory: {str(e)}")
        else:
            st.session_state['output_dir'] = out_dir
            st.success(f"Output directory set to: {out_dir}")

    if st.button("Proceed to Data Loading"):
        # NOTE(review): relies on session-based navigation being handled by
        # the app entry point, which is not visible from this module.
        st.session_state.page = 'data_loading'


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pages/fetch_resources_store_collection.py
DELETED
|
@@ -1,59 +0,0 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
from langchain_community.document_loaders import AsyncHtmlLoader
|
| 3 |
-
from langchain.schema import Document
|
| 4 |
-
import json
|
| 5 |
-
from typing import Iterable
|
| 6 |
-
|
| 7 |
-
# Placeholder for async fetch function (adjust based on actual async handling in your environment)
|
| 8 |
-
async def fetch_documents(urls):
    """Fetch the HTML pages at *urls* and return them as LangChain documents.

    The loader's ``load()`` is called as a regular blocking method that
    returns a list, so it must not be awaited directly. It is executed in a
    worker thread, which keeps this function awaitable without blocking the
    caller's event loop.

    NOTE(review): confirm ``AsyncHtmlLoader.load`` is synchronous in the
    installed langchain_community version. Requires Python 3.9+ for
    ``asyncio.to_thread``.
    """
    import asyncio  # local import: the module's import block is left untouched

    loader = AsyncHtmlLoader(urls)
    return await asyncio.to_thread(loader.load)


def save_docs_to_jsonl(array: Iterable[Document], file_path: str) -> None:
    """Write each document in *array* to *file_path* as one JSON object per line.

    Only ``page_content`` and ``metadata`` are serialized, which are exactly
    the keyword arguments ``load_docs_from_jsonl`` feeds back into
    ``Document(**data)``.

    Raises:
        OSError: If *file_path* cannot be opened for writing.
        TypeError: If a document's metadata is not JSON-serializable.
    """
    # Explicit UTF-8 so the file does not depend on the platform default.
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            record = {"page_content": doc.page_content, "metadata": doc.metadata}
            jsonl_file.write(json.dumps(record, ensure_ascii=False) + '\n')


def load_docs_from_jsonl(file_path) -> Iterable[Document]:
    """Read a JSON-lines file and return a list of ``Document`` objects.

    Blank lines (for example a trailing newline left by an editor) are
    skipped instead of raising ``json.JSONDecodeError``.
    """
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        return [Document(**json.loads(line)) for line in jsonl_file if line.strip()]


def fetch_clean_organize_page():
    """Render the "Fetch, Clean, and Organize Documents" page.

    Reads the URL selection from ``st.session_state['selected_urls']``,
    fetches the pages on request, keeps the result in
    ``st.session_state['docs']`` and offers to save/download it as
    ``documents.jsonl``.
    """
    import asyncio  # local import: the module's import block is left untouched

    st.title("Fetch, Clean, and Organize Documents")

    # Make sure the key exists even when this page is opened first.
    if 'selected_urls' not in st.session_state:
        st.session_state['selected_urls'] = []

    urls = st.session_state['selected_urls']
    # The URL-collection page of this app stores its selection as a
    # single-column pandas DataFrame; flatten that into a plain list.
    if hasattr(urls, "iloc"):
        urls = urls.iloc[:, 0].tolist()

    if st.button("Fetch Documents"):
        # Drive the coroutine to completion; storing the un-awaited coroutine
        # would make the len() call below fail.
        # NOTE(review): assumes no event loop is already running in the
        # thread that executes this script.
        st.session_state['docs'] = asyncio.run(fetch_documents(urls))

    if 'docs' in st.session_state:
        st.write(f"Fetched {len(st.session_state['docs'])} documents.")

        if st.button("Save Documents as JSON"):
            save_docs_to_jsonl(st.session_state['docs'], "documents.jsonl")
            st.success("Documents saved as JSON.")

            # Read the bytes up front so the button never depends on a file
            # handle that is closed when the ``with`` block ends.
            with open("documents.jsonl", "rb") as file:
                payload = file.read()
            st.download_button(
                label="Download JSON",
                data=payload,
                file_name="documents.jsonl",
                mime="application/octet-stream",
            )
|
| 57 |
-
|
| 58 |
-
# Assuming this function is called in your app
|
| 59 |
-
fetch_clean_organize_page()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pages/file_web_source_collection.py
DELETED
|
@@ -1,87 +0,0 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import requests
|
| 4 |
-
from bs4 import BeautifulSoup
|
| 5 |
-
from urllib.parse import urljoin, urlparse
|
| 6 |
-
|
| 7 |
-
def find_linked_urls(url):
    """Return the set of raw ``href`` values of all ``<a>`` tags found at *url*.

    Failures are reported on the page via ``st.write`` and yield an empty
    set, so a single bad URL never aborts a multi-URL scan.

    Args:
        url: Address of the page to download and scan.

    Returns:
        A set of ``href`` strings exactly as they appear in the markup
        (possibly relative); empty on any request error or non-200 status.
    """
    try:
        # A timeout keeps one unresponsive host from hanging the page run.
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        st.write(f"An error occurred with {url}: {e}")
        return set()

    if response.status_code != 200:
        st.write(f"Failed to retrieve {url}")
        return set()

    soup = BeautifulSoup(response.text, 'html.parser')
    # href=True restricts the search to anchors that carry an href attribute.
    return {anchor['href'] for anchor in soup.find_all('a', href=True)}
|
| 20 |
-
|
| 21 |
-
def convert_to_absolute_urls(base_url, links):
    """Resolve every entry of *links* against *base_url*.

    Links that already carry a URL scheme are returned unchanged; everything
    else (relative paths, ``//host/path`` references, fragments) is joined
    onto *base_url*. The scheme is detected by parsing rather than by a
    ``startswith('http')`` prefix test, which would mistake relative paths
    such as ``http-guide.html`` for absolute URLs.

    Args:
        base_url: URL of the page the links were found on.
        links: Iterable of href strings.

    Returns:
        A set of resolved URL strings.
    """
    return {
        link if urlparse(link).scheme else urljoin(base_url, link)
        for link in links
    }
|
| 23 |
-
|
| 24 |
-
def categorize_links(base_url, links):
    """Split *links* into those on the same host as *base_url* and the rest.

    Hosts are compared case-insensitively because host names are not
    case-sensitive; the port (if any) is still part of the comparison.

    Args:
        base_url: URL whose network location defines "internal".
        links: Iterable of absolute URL strings.

    Returns:
        A ``(internal_links, external_links)`` tuple of sets.
    """
    # Parse the base once instead of once per link.
    base_host = urlparse(base_url).netloc.lower()
    internal_links, external_links = set(), set()
    for link in links:
        if urlparse(link).netloc.lower() == base_host:
            internal_links.add(link)
        else:
            external_links.add(link)
    return internal_links, external_links
|
| 32 |
-
|
| 33 |
-
def main():
    """Render the URL-collection page.

    Scans the entered pages for links, lets the user tick the internal links
    to keep, shows the selection as a table, stores it as a single-column
    DataFrame under ``st.session_state['selected_urls']`` and offers it as a
    CSV download. Scan results persist across reruns in
    ``st.session_state['scanned_urls']``.
    """
    st.title("Data Source Configuration")

    if 'scanned_urls' not in st.session_state:
        st.session_state['scanned_urls'] = {}

    st.subheader("Scan Websites for URLs")
    url_input = st.text_area("Enter URLs to scan, separated by new lines:")
    url_list = [line.strip() for line in url_input.splitlines() if line.strip()]

    if st.button("Scan URLs"):
        for url in url_list:
            absolute_urls = convert_to_absolute_urls(url, find_linked_urls(url))
            internal_links, external_links = categorize_links(url, absolute_urls)
            st.session_state['scanned_urls'][url] = {"internal": internal_links, "external": external_links}

    selected_urls = []
    for base_url, links in st.session_state['scanned_urls'].items():
        st.write(f"Base URL: {base_url}")
        # Sort so the checkboxes keep a stable order between reruns.
        internal = sorted(links["internal"])

        if st.checkbox(f"Include all internal links from {base_url}", key=f"all_{base_url}"):
            selected_urls.extend(internal)
        else:
            # The key includes the base URL: the same link can be listed
            # under several scanned pages, and Streamlit rejects duplicate
            # widget keys.
            selected_urls.extend(
                link for link in internal
                if st.checkbox(link, key=f"link_{base_url}_{link}")
            )

        if links["external"]:
            st.write("External links:")
            for link in sorted(links["external"]):
                st.write(link)

    # Drop duplicates (a link picked under two base URLs) but keep the order.
    selected_urls = list(dict.fromkeys(selected_urls))

    if selected_urls:
        df_selected_urls = pd.DataFrame(selected_urls, columns=['Selected URLs'])
        st.write(df_selected_urls)

        st.session_state['selected_urls'] = df_selected_urls

        # Convert DataFrame to CSV for download
        csv = df_selected_urls.to_csv(index=False).encode('utf-8')

        st.download_button(
            label="Download selected URLs as CSV",
            data=csv,
            file_name='selected_urls.csv',
            mime='text/csv',
        )
    elif 'selected_urls' in st.session_state:
        # Nothing is selected any more: remove the previous selection so
        # later pages do not act on stale URLs.
        del st.session_state['selected_urls']


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pages/model_selection.py
DELETED
|
@@ -1,52 +0,0 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
|
| 3 |
-
def main():
    """Render the "Model Selection and Configuration" page.

    Lets the user choose an embedding model and an LLM, shows model-specific
    settings, and on "Save Model Configuration" stores the choices under
    ``selected_embedding_model``, ``selected_llm_model`` and
    ``llm_model_config`` in ``st.session_state``.
    """
    st.title("Model Selection and Configuration")
    st.write("Select the embedding model and the large language model (LLM) for processing.")

    # Embedding Model Selection
    embedding_models = ["thenlper/gte-small", "sentence-transformers/all-MiniLM-L6-v2", "other"]
    selected_embedding_model = st.selectbox("Select Embedding Model", options=embedding_models)

    # LLM Model Selection
    llm_models = ["mistralai/Mistral-7B-Instruct-v0.2", "gpt-3.5-turbo", "other"]
    selected_llm_model = st.selectbox("Select LLM Model", options=llm_models)

    # Display selections (for demonstration)
    st.write("Selected Embedding Model:", selected_embedding_model)
    st.write("Selected LLM Model:", selected_llm_model)

    st.header("Model Configuration")

    # Embedding Model Configuration (example)
    if selected_embedding_model == "thenlper/gte-small":
        st.write("No additional configuration required for this model.")
    else:
        st.write("Configuration options for other models will appear here.")

    # LLM Model Configuration (example). The dict starts empty so that saving
    # works for every model; the sliders below exist only for Mistral, and
    # reading their values unconditionally would fail for the other choices.
    llm_model_config = {}
    if selected_llm_model == "mistralai/Mistral-7B-Instruct-v0.2":
        llm_model_config["max_tokens"] = st.slider("Max Tokens", min_value=100, max_value=1000, value=250)
        llm_model_config["temperature"] = st.slider("Temperature", min_value=0.0, max_value=1.0, value=0.7, step=0.01)
    else:
        st.write("Configuration options for other models will appear here.")

    # Save model selections and configurations
    if st.button("Save Model Configuration"):
        st.session_state['selected_embedding_model'] = selected_embedding_model
        st.session_state['selected_llm_model'] = selected_llm_model
        st.session_state['llm_model_config'] = llm_model_config
        st.success("Model configurations saved.")


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pages/processing_embedding.py
DELETED
|
@@ -1,53 +0,0 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
import time # Used to simulate processing time
|
| 3 |
-
|
| 4 |
-
def main():
    """Render the "Processing and Embedding" page.

    Offers a button that runs a simulated processing step, shows a progress
    bar reflecting ``st.session_state['processing_complete']``, lists
    placeholder tuning and saving options, and provides two navigation
    buttons that set ``st.session_state.page``.
    """
    st.title("Processing and Embedding")
    st.write("Process the selected data using the configured models and save the results.")

    if st.button("Start Processing"):
        with st.spinner("Processing..."):
            time.sleep(5)  # stand-in for the real processing logic
        st.session_state['processing_complete'] = True
        st.success("Processing completed successfully!")

    # Full bar once processing has finished, empty bar otherwise.
    finished = bool(st.session_state.get('processing_complete'))
    st.progress(100 if finished else 0)
    if finished:
        st.write("Data has been processed and embedded successfully.")

    st.header("Parameter Tuning")
    st.write("Adjust processing parameters if necessary. (This section is a placeholder.)")

    st.header("Save Results")
    # Each checkbox only announces what would be saved; no saving happens yet.
    save_options = (
        ("Save Preprocessed Pages", "Preprocessed pages will be saved."),
        ("Save Processed Pages", "Processed pages will be saved."),
        ("Save Vectors Store Data", "Vectors store data will be saved."),
    )
    for checkbox_label, notice in save_options:
        if st.checkbox(checkbox_label):
            st.write(notice)

    # Navigation: one button per column, each switching the active page.
    left, right = st.columns(2)
    navigation = (
        (left, "Back to Model Selection", 'model_selection'),
        (right, "Complete and Exit", 'main_page'),
    )
    for column, button_label, target_page in navigation:
        with column:
            if st.button(button_label):
                st.session_state.page = target_page


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|