rianders committed on
Commit
af4dc9a
·
1 Parent(s): 2ac76dd

Removed extra pages

Browse files
pages/data_loading.py DELETED
@@ -1,39 +0,0 @@
1
- import streamlit as st
2
-
3
def main():
    """Render the Data Loading page: choose directories and file types to process."""
    st.title("Data Loading")

    # Page introduction.
    st.write("Select directories and file types to process from the configured data source.")

    # Placeholder directory names standing in for a real repository listing
    # (a list_repo_directories(repo_url) helper would supply these).
    repo_dirs = ["src", "docs", "examples", "tests"]

    # Directory picker — everything pre-selected by default.
    chosen_dirs = st.multiselect("Select Directories", options=repo_dirs, default=repo_dirs)
    if not chosen_dirs:
        st.error("Please select at least one directory.")
    else:
        # Persist the choice for later processing steps.
        st.session_state['selected_directories'] = chosen_dirs
        st.success(f"Selected directories: {', '.join(chosen_dirs)}")

    # File-type filter — same pattern as the directory picker.
    known_types = ["pdf", "txt", "md"]
    chosen_types = st.multiselect("Select File Types to Include", options=known_types, default=known_types)
    if not chosen_types:
        st.error("Please select at least one file type.")
    else:
        # Persist the choice for later processing steps.
        st.session_state['selected_file_types'] = chosen_types
        st.success(f"Selected file types: {', '.join(chosen_types)}")

    # Session-based navigation hand-off to the next page (see app.py wiring).
    if st.button("Proceed to Model Selection and Configuration"):
        st.session_state.page = 'model_selection'

if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/data_source_config.py DELETED
@@ -1,41 +0,0 @@
1
- import streamlit as st
2
- import os
3
-
4
def main():
    """Render the Data Source Configuration page.

    Collects a GitHub repository URL and an output directory, storing them in
    st.session_state under 'repo_url' and 'output_dir' for later pages.
    """
    st.title("Data Source Configuration")

    # Explanation or introduction
    st.write("Configure the source from which to mine data, including the GitHub repository and the output directory for storing generated data.")

    # Repository selection
    repo_url = st.text_input("GitHub Repository URL", "https://github.com/username/repository")

    # BUG FIX: the previous check was `"github.com" not in repo_url`, which
    # accepted strings like "https://evil.com/?ref=github.com". Parse the URL
    # and require an http(s) scheme with a github.com host instead.
    from urllib.parse import urlparse  # local import keeps this page self-contained
    parsed = urlparse(repo_url)
    if parsed.scheme not in ("http", "https") or parsed.netloc not in ("github.com", "www.github.com"):
        st.error("Please enter a valid GitHub repository URL.")
    else:
        # Validation passed: store the repo URL for later processing steps.
        st.session_state['repo_url'] = repo_url
        st.success("Repository URL saved.")

    # Output directory selection
    default_dir = os.path.join(".", "output_data")  # Default directory path
    out_dir = st.text_input("Output Directory for Generated Data", value=default_dir)

    # Create the directory on demand and remember it.
    if st.button("Save Output Directory"):
        try:
            os.makedirs(out_dir, exist_ok=True)  # Create the directory if it does not exist
            st.session_state['output_dir'] = out_dir
            st.success(f"Output directory set to: {out_dir}")
        except OSError as e:
            # os.makedirs failures surface as OSError; report instead of crashing the page.
            st.error(f"Failed to create the directory: {str(e)}")

    # Session-based navigation to the next page (see app.py wiring).
    if st.button("Proceed to Data Loading"):
        st.session_state.page = 'data_loading'

if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/fetch_resources_store_collection.py DELETED
@@ -1,59 +0,0 @@
1
- import streamlit as st
2
- from langchain_community.document_loaders import AsyncHtmlLoader
3
- from langchain.schema import Document
4
- import json
5
- from typing import Iterable
6
-
7
# Placeholder for async fetch function (adjust based on actual async handling in your environment)
async def fetch_documents(urls):
    """Fetch the pages at *urls* as documents via AsyncHtmlLoader.

    NOTE(review): langchain's ``AsyncHtmlLoader.load()`` is documented as a
    synchronous call, so awaiting its return value looks wrong — confirm
    whether ``aload()`` (or ``load()`` without ``await``) is intended here.
    """
    loader = AsyncHtmlLoader(urls)
    docs = await loader.load()
    return docs
12
-
13
def save_docs_to_jsonl(array: Iterable["Document"], file_path: str) -> None:
    """Serialize documents to a JSON-Lines file, one JSON object per line.

    Each line is written as ``{"page_content": ..., "metadata": ...}``, the
    exact keyword arguments ``load_docs_from_jsonl`` passes to ``Document``,
    so a save/load round trip reconstructs equivalent documents.
    (BUG FIX: the previous code called ``doc.to_dict()``, a method the
    comment itself admitted was assumed; its output was not guaranteed to
    round-trip through ``Document(**data)``.)

    Args:
        array: documents to write; each must expose ``.page_content`` and ``.metadata``.
        file_path: destination path; overwritten if it already exists.
    """
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            record = {"page_content": doc.page_content, "metadata": doc.metadata}
            jsonl_file.write(json.dumps(record) + '\n')
17
-
18
def load_docs_from_jsonl(file_path) -> Iterable[Document]:
    """Rebuild Document objects from a JSON-Lines file, one per line.

    Each line's JSON object supplies the keyword arguments for Document.
    Returns the documents as a list.
    """
    with open(file_path, 'r') as jsonl_file:
        return [Document(**json.loads(line)) for line in jsonl_file]
26
-
27
def fetch_clean_organize_page():
    """Render the fetch/save page for the URLs chosen on the collection page.

    Reads st.session_state['selected_urls'], fetches the pages on demand,
    and offers the fetched documents as a downloadable JSON-Lines file.
    """
    st.title("Fetch, Clean, and Organize Documents")

    # Initialize 'selected_urls' so the page works even when visited first.
    if 'selected_urls' not in st.session_state:
        st.session_state['selected_urls'] = []  # Default to an empty list

    urls = st.session_state['selected_urls']
    if st.button("Fetch Documents"):
        # BUG FIX: fetch_documents is a coroutine function; calling it without
        # running the event loop stored a coroutine object, so the len() call
        # below would fail. Drive it to completion with asyncio.run.
        import asyncio  # local import: only needed on this code path
        docs = asyncio.run(fetch_documents(urls))
        st.session_state['docs'] = docs

    if 'docs' in st.session_state:
        st.write(f"Fetched {len(st.session_state['docs'])} documents.")

        if st.button("Save Documents as JSON"):
            save_docs_to_jsonl(st.session_state['docs'], "documents.jsonl")
            st.success("Documents saved as JSON.")

            # Provide download link (streamlit >= 0.88.0)
            with open("documents.jsonl", "rb") as file:
                st.download_button(
                    label="Download JSON",
                    data=file,
                    file_name="documents.jsonl",
                    mime="application/octet-stream")
57
-
58
# Assuming this function is called in your app
# Module-level call: Streamlit executes page modules top to bottom on each
# rerun, so this renders the page whenever the module is imported/run.
fetch_clean_organize_page()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/file_web_source_collection.py DELETED
@@ -1,87 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import requests
4
- from bs4 import BeautifulSoup
5
- from urllib.parse import urljoin, urlparse
6
-
7
def find_linked_urls(url):
    """Return the set of href values found in <a> tags of the page at *url*.

    Always returns a set: on a non-200 response, a request error, or a parse
    error, a message is shown and an empty set is returned, so callers can
    iterate the result unconditionally. (BUG FIX: previously the non-200
    branch could fall through without returning a set, handing callers None.)
    """
    try:
        # timeout keeps one dead host from hanging the whole scan
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            hrefs = {a.get('href') for a in soup.find_all('a') if a.get('href') is not None}
            return hrefs
        st.write(f"Failed to retrieve {url}")
    except Exception as e:
        st.write(f"An error occurred with {url}: {e}")
    return set()
20
-
21
def convert_to_absolute_urls(base_url, links):
    """Resolve relative links against *base_url*; pass absolute ones through unchanged."""
    resolved = set()
    for href in links:
        resolved.add(href if href.startswith('http') else urljoin(base_url, href))
    return resolved
23
-
24
def categorize_links(base_url, links):
    """Split *links* into (internal, external) sets.

    A link is internal when its netloc matches that of *base_url*.
    """
    base_host = urlparse(base_url).netloc  # parse the base once, not per link
    internal_links = {link for link in links if urlparse(link).netloc == base_host}
    external_links = set(links) - internal_links
    return internal_links, external_links
32
-
33
def main():
    """Render the URL collection page: scan sites, pick links, export CSV.

    Stores the chosen URLs in st.session_state['selected_urls'] as a plain
    list of URL strings so downstream pages can iterate them directly.
    """
    st.title("Data Source Configuration")

    if 'scanned_urls' not in st.session_state:
        st.session_state['scanned_urls'] = {}

    st.subheader("Scan Websites for URLs")
    url_input = st.text_area("Enter URLs to scan, separated by new lines:")
    url_list = [url.strip() for url in url_input.strip().split('\n') if url.strip()]  # Splitting and cleaning input

    scan_button_clicked = st.button("Scan URLs")
    if scan_button_clicked or st.session_state['scanned_urls']:
        if scan_button_clicked:
            for url in url_list:
                unique_urls = find_linked_urls(url)
                absolute_urls = convert_to_absolute_urls(url, unique_urls)
                internal_links, external_links = categorize_links(url, absolute_urls)
                st.session_state['scanned_urls'][url] = {"internal": internal_links, "external": external_links}

        selected_urls = []
        for base_url, links in st.session_state['scanned_urls'].items():
            st.write(f"Base URL: {base_url}")
            include_all_internal = st.checkbox(f"Include all internal links from {base_url}", key=f"all_{base_url}")

            if include_all_internal:
                selected_urls.extend(links["internal"])
            else:
                # One checkbox per internal link; the link itself is the widget key.
                selected_internal = [link for link in links["internal"] if st.checkbox(link, key=link)]
                selected_urls.extend(selected_internal)

            if links["external"]:
                st.write("External links:")
                for link in links["external"]:
                    st.write(link)

        if selected_urls:
            df_selected_urls = pd.DataFrame(selected_urls, columns=['Selected URLs'])
            st.write(df_selected_urls)

            # BUG FIX: store the plain list, not the DataFrame. Consumers of
            # 'selected_urls' initialize/iterate it as a list of URL strings,
            # and iterating a DataFrame yields column labels instead.
            st.session_state['selected_urls'] = selected_urls

            # Convert DataFrame to CSV for download
            csv = df_selected_urls.to_csv(index=False).encode('utf-8')

            st.download_button(
                label="Download selected URLs as CSV",
                data=csv,
                file_name='selected_urls.csv',
                mime='text/csv',
            )

if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/model_selection.py DELETED
@@ -1,52 +0,0 @@
1
- import streamlit as st
2
-
3
def main():
    """Render the Model Selection and Configuration page.

    Saves 'selected_embedding_model', 'selected_llm_model', and
    'llm_model_config' into st.session_state when the user clicks Save.
    """
    st.title("Model Selection and Configuration")

    # Introduction
    st.write("Select the embedding model and the large language model (LLM) for processing.")

    # Embedding Model Selection
    embedding_models = ["thenlper/gte-small", "sentence-transformers/all-MiniLM-L6-v2", "other"]
    selected_embedding_model = st.selectbox("Select Embedding Model", options=embedding_models)

    # LLM Model Selection
    llm_models = ["mistralai/Mistral-7B-Instruct-v0.2", "gpt-3.5-turbo", "other"]
    selected_llm_model = st.selectbox("Select LLM Model", options=llm_models)

    # Display selections (for demonstration)
    st.write("Selected Embedding Model:", selected_embedding_model)
    st.write("Selected LLM Model:", selected_llm_model)

    # Configuration options for the selected models
    st.header("Model Configuration")

    # Embedding Model Configuration (example)
    if selected_embedding_model == "thenlper/gte-small":
        # Placeholder for model-specific configuration options
        st.write("No additional configuration required for this model.")
    else:
        # Configuration for other models
        st.write("Configuration options for other models will appear here.")

    # BUG FIX: max_tokens/temperature were previously defined only inside the
    # Mistral branch, so clicking "Save Model Configuration" with any other
    # LLM selected raised NameError. Define defaults up front; the sliders
    # override them when shown.
    max_tokens = 250
    temperature = 0.7

    # LLM Model Configuration (example)
    if selected_llm_model == "mistralai/Mistral-7B-Instruct-v0.2":
        max_tokens = st.slider("Max Tokens", min_value=100, max_value=1000, value=250)
        temperature = st.slider("Temperature", min_value=0.0, max_value=1.0, value=0.7, step=0.01)
    else:
        # Configuration for other models
        st.write("Configuration options for other models will appear here.")

    # Save model selections and configurations
    if st.button("Save Model Configuration"):
        st.session_state['selected_embedding_model'] = selected_embedding_model
        st.session_state['selected_llm_model'] = selected_llm_model
        # Configurations may grow per-model; stored as one dict for now.
        st.session_state['llm_model_config'] = {"max_tokens": max_tokens, "temperature": temperature}
        st.success("Model configurations saved.")

    # Optional: Proceed to the next step
    # st.session_state.page = 'processing_embedding'

if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/processing_embedding.py DELETED
@@ -1,53 +0,0 @@
1
- import streamlit as st
2
- import time # Used to simulate processing time
3
-
4
def main():
    """Render the Processing and Embedding page (simulated pipeline run)."""
    st.title("Processing and Embedding")

    # Page intro.
    st.write("Process the selected data using the configured models and save the results.")

    # Kick off the (simulated) processing run.
    if st.button("Start Processing"):
        with st.spinner("Processing..."):
            time.sleep(5)  # Simulated work; replace with the real pipeline.
            st.session_state['processing_complete'] = True
            st.success("Processing completed successfully!")

    # Progress indicator reflects whether a run has finished.
    if st.session_state.get('processing_complete'):
        st.progress(100)
        st.write("Data has been processed and embedded successfully.")
    else:
        st.progress(0)

    # Placeholder section for tuning knobs.
    st.header("Parameter Tuning")
    st.write("Adjust processing parameters if necessary. (This section is a placeholder.)")

    # Output-saving toggles (placeholders for real persistence logic).
    st.header("Save Results")
    if st.checkbox("Save Preprocessed Pages"):
        st.write("Preprocessed pages will be saved.")
    if st.checkbox("Save Processed Pages"):
        st.write("Processed pages will be saved.")
    if st.checkbox("Save Vectors Store Data"):
        st.write("Vectors store data will be saved.")

    # Navigation: back to model selection, or finish and return to the main page.
    nav_left, nav_right = st.columns(2)
    with nav_left:
        if st.button("Back to Model Selection"):
            st.session_state.page = 'model_selection'
    with nav_right:
        if st.button("Complete and Exit"):
            st.session_state.page = 'main_page'

if __name__ == "__main__":
    main()
- main()