Spaces:
Running
Running
| import os | |
| from urllib.parse import urlparse | |
| import pandas as pd | |
| import gradio as gr | |
# Relative file name of the CSV export; process_urls() rewrites this same file
# on every call and returns the path for the download component.
CSV_OUTPUT_PATH = "url_keywords_full.csv"
# Characters that separate words inside a URL path; each maps to a space so a
# single whitespace split yields the tokens.
_DELIMITER_TABLE = str.maketrans(dict.fromkeys("_-./", " "))


def extract_keywords_from_url(url: str) -> list[str]:
    """Return the lowercase keyword tokens found in the path of *url*.

    Only the path component reported by ``urlparse`` contributes keywords;
    the scheme, host, query string and fragment are ignored. Percent-escapes
    in the path are decoded, a trailing ``.html``/``.htm``/``.php`` extension
    is removed regardless of letter case, and the remainder is split on
    ``/``, ``_``, ``-``, ``.`` and whitespace.

    Note: a URL written without a scheme (e.g. ``example.com/foo``) is parsed
    by ``urlparse`` as a bare path, so its host part is tokenized as well.

    Args:
        url: The URL to analyse.

    Returns:
        The tokens in path order, lowercased. Empty when the path is empty.
    """
    # Imported here so this function does not depend on a file-level import
    # change.
    from urllib.parse import unquote

    # Decode percent-escapes first so "%20"/"%2D" act as real delimiters and
    # non-ASCII words are not emitted as raw "%c3%a9"-style fragments.
    path = unquote(urlparse(url).path).strip("/")

    # Tokens are lowercased below, so the extension match must ignore case
    # too; otherwise "Page.HTML" would leak "html" into the keywords.
    for ext in (".html", ".htm", ".php"):
        if path[-len(ext):].lower() == ext:
            path = path[: -len(ext)]

    return [token.lower() for token in path.translate(_DELIMITER_TABLE).split()]
def process_urls(input_text: str):
    """
    Turn a block of text holding one URL per line into a keyword table.

    Blank lines are skipped and surrounding whitespace is trimmed from each URL.
    Every remaining URL becomes one row with columns ['url', 'keywords'], where
    'keywords' is the comma-separated token list for that URL. The table is
    written to CSV_OUTPUT_PATH, and the DataFrame and that path are returned.
    """
    rows = []
    for raw_line in input_text.splitlines():
        candidate = raw_line.strip()
        if not candidate:
            continue
        joined = ", ".join(extract_keywords_from_url(candidate))
        rows.append({"url": candidate, "keywords": joined})

    # Explicit columns keep the header present even when there are no rows.
    table = pd.DataFrame(rows, columns=["url", "keywords"])
    table.to_csv(CSV_OUTPUT_PATH, index=False, encoding="utf-8")
    return table, CSV_OUTPUT_PATH
# Aggressive CSS override passed to gr.Blocks(css=...): rule 1 forces a white
# background, black text and a grey border on every element inside
# `.gradio-container`; rule 2 then re-styles `.gr-button` (including its hover
# and focus states) with an accent background and white text. Every
# declaration uses `!important`.
# NOTE(review): rule 2 only takes effect if the rendered buttons carry the
# `.gr-button` class - confirm against the DOM of the installed Gradio version.
custom_css = """
/* 1) Force every element inside the Gradio container to white bg + black text */
.gradio-container * {
background-color: #ffffff !important;
color: #000000 !important;
border-color: #cccccc !important;
}
/* 2) Restore button accent + white text */
.gradio-container .gr-button,
.gradio-container .gr-button:hover,
.gradio-container .gr-button:focus {
background-color: #1f6feb !important;
color: #ffffff !important;
border-color: #1f6feb !important;
}
"""
# Build the UI. Components are created inside the Blocks context in the order
# they are written: heading, instructions, URL input, button, result table,
# CSV download.
with gr.Blocks(theme=gr.themes.Default(), css=custom_css) as demo:
    gr.Markdown("## URL Keywords Extractor by DEJAN")
    gr.Markdown(
        "Enter one or more URLs (one per line) below, then click **Generate** to see extracted keywords."
    )
    # Multi-line textbox; its text is handed to process_urls as `input_text`.
    url_input = gr.Textbox(
        lines=5,
        placeholder="https://dejan.ai/labs/interactive-demo\nhttps://example.com/foo-bar",
        label="Enter URLs (one per line)",
    )
    generate_btn = gr.Button("Generate", variant="primary")
    # Read-only table; headers match the DataFrame columns built by process_urls.
    df_output = gr.Dataframe(
        headers=["url", "keywords"],
        label="Extracted URL Keywords",
        interactive=False,
    )
    download_csv = gr.File(label="EXPORT CSV", file_types=[".csv"])
    # process_urls returns (DataFrame, csv_path); the two values are routed to
    # the table and the file component in that order.
    generate_btn.click(fn=process_urls, inputs=url_input, outputs=[df_output, download_csv])

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()