Spaces:

darwincb
/

jan-v1-research

Paused

App Files Files Community

darwincb committed on Aug 21

Commit

3fcfd23

1 Parent(s): 4b392a8

Simplify to CPU version for initial testing

Browse files

Files changed (6) hide show

MANUAL_UPLOAD.md +38 -0
app-simple.py +186 -0
app.py +93 -313
push-to-hf.sh +35 -0
requirements-simple.txt +5 -0
requirements.txt +1 -6

MANUAL_UPLOAD.md ADDED Viewed

	@@ -0,0 +1,38 @@

+# Instrucciones para subir Jan v1 manualmente
+Ya que el token no tiene permisos de escritura, puedes:
+## Opción 1: Copiar y pegar directamente en Hugging Face
+1. Ve a: https://huggingface.co/spaces/darwincb/jan-v1-research/tree/main
+2. Click en "Files and versions"
+3. Click en "app.py"
+4. Click en el ícono de lápiz (Edit)
+5. Borra todo y pega el contenido del archivo: `/Users/darwinborges/jan-v1-research/app.py`
+6. Commit message: "Add Jan v1 Research Assistant"
+7. Click "Commit changes to main"
+8. Vuelve a "Files and versions"
+9. Click en "+ Add file" > "Create a new file"
+10. Nombre: `requirements.txt`
+11. Pega el contenido del archivo: `/Users/darwinborges/jan-v1-research/requirements.txt`
+12. Click "Commit new file to main"
+## Opción 2: Obtener token con permisos de escritura
+1. Ve a: https://huggingface.co/settings/tokens
+2. Crea nuevo token con permisos "write"
+3. Ejecuta:
+```bash
+cd /Users/darwinborges/jan-v1-research
+huggingface-cli login --token TU_NUEVO_TOKEN
+git push origin main
+```
+## IMPORTANTE después de subir:
+⚠️ Ve a Settings del Space y selecciona:
+- Hardware: **GPU T4 medium**
+- Sleep time: 1 hour (para ahorrar costos)
+El modelo Jan v1 (4B params) NO funcionará sin GPU.

app-simple.py ADDED Viewed

	@@ -0,0 +1,186 @@

+"""
+Jan v1 Research Assistant - Simplified Version for CPU
+Works without GPU - returns a templated analysis (no model or API calls yet)
+"""
+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+import json
+from datetime import datetime
+def scrape_url(url: str) -> str:
+    """Scrape and extract text from URL"""
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=10)
+        soup = BeautifulSoup(response.content, 'html.parser')
+        # Remove script and style elements
+        for script in soup(["script", "style"]):
+            script.decompose()
+        text = soup.get_text()
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        text = ' '.join(chunk for chunk in chunks if chunk)
+        return text[:4000]  # Limit to 4000 chars
+    except Exception as e:
+        return f"Error scraping URL: {str(e)}"
+def research_assistant_simple(query: str, context: str = "") -> str:
+    """
+    Simplified research assistant using Hugging Face Inference API
+    """
+    # For now, return a structured analysis template
+    # This can be replaced with actual API calls to Jan v1 when available
+    if context.startswith('http'):
+        context = scrape_url(context)
+    analysis = f"""
+# Research Analysis
+## Query
+{query}
+## Context Summary
+{context[:500] if context else "No context provided"}...
+## Analysis Framework
+### 1. Key Findings
+- The context provides information about the topic
+- Further analysis would require examining specific aspects
+- Consider multiple perspectives on this subject
+### 2. Critical Questions
+- What are the primary assumptions?
+- What evidence supports the main claims?
+- What alternative viewpoints exist?
+### 3. Research Directions
+- Investigate primary sources
+- Compare with related studies
+- Examine historical context
+### 4. Limitations
+- Limited context provided
+- Single source analysis
+- Requires deeper investigation
+### 5. Next Steps
+- Gather additional sources
+- Conduct comparative analysis
+- Validate key claims
+---
+*Note: This is a simplified version. For full Jan v1 capabilities, GPU hardware is required.*
+"""
+    return analysis
+# Create Gradio interface
+# Layout: a Markdown header followed by three tabs
+# (Research Analysis, Web Scraper, Instructions).
+with gr.Blocks(title="Jan v1 Research Assistant (Simplified)", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🔬 Jan v1 Research Assistant (Simplified Version)
+    This is a CPU-compatible version with limited features.
+    For full Jan v1 (4B params) capabilities, GPU hardware is required.
+    ### Available Features:
+    - 🌐 Web scraping and text extraction
+    - 📝 Structured research framework
+    - 🔍 Context analysis
+    """)
+    # Tab 1: query + context inputs on the left, report output on the right.
+    with gr.Tab("Research Analysis"):
+        with gr.Row():
+            with gr.Column():
+                query = gr.Textbox(
+                    label="Research Query",
+                    placeholder="What would you like to research?",
+                    lines=2
+                )
+                context = gr.Textbox(
+                    label="Context (paste text or URL)",
+                    placeholder="Paste article text or enter URL to analyze",
+                    lines=5
+                )
+                analyze_btn = gr.Button("🔍 Analyze", variant="primary")
+            with gr.Column():
+                output = gr.Textbox(
+                    label="Analysis Results",
+                    lines=15
+                )
+        # On click, pass (query, context) to research_assistant_simple and
+        # show its returned string in the results box.
+        analyze_btn.click(
+            research_assistant_simple,
+            inputs=[query, context],
+            outputs=output
+        )
+    # Tab 2: standalone access to the scraper used by the analysis tab.
+    with gr.Tab("Web Scraper"):
+        with gr.Row():
+            with gr.Column():
+                url_input = gr.Textbox(
+                    label="URL to Scrape",
+                    placeholder="https://example.com/article",
+                    lines=1
+                )
+                scrape_btn = gr.Button("🌐 Extract Text", variant="primary")
+            with gr.Column():
+                scrape_output = gr.Textbox(
+                    label="Extracted Text",
+                    lines=10
+                )
+        # On click, pass the URL to scrape_url and show the extracted text
+        # (or its error string) in the output box.
+        scrape_btn.click(
+            scrape_url,
+            inputs=url_input,
+            outputs=scrape_output
+        )
+    # Tab 3: static help text only; no event handlers are registered here.
+    with gr.Tab("Instructions"):
+        gr.Markdown("""
+        ## 📋 How to Enable Full Jan v1
+        This Space is currently running in simplified mode without the actual Jan v1 model.
+        To enable full capabilities:
+        1. **Go to Settings**: https://huggingface.co/spaces/darwincb/jan-v1-research/settings
+        2. **Select Hardware**: GPU T4 medium ($0.60/hour)
+        3. **Save changes**
+        4. **Wait 5 minutes** for rebuild
+        ### Current Limitations (CPU mode):
+        - ❌ No actual Jan v1 model (4B params needs GPU)
+        - ❌ No AI-powered analysis
+        - ✅ Web scraping works
+        - ✅ Structured framework available
+        ### With GPU Enabled:
+        - ✅ Full Jan v1 model (91.1% accuracy)
+        - ✅ AI-powered research analysis
+        - ✅ Entity extraction
+        - ✅ Multi-source comparison
+        - ✅ Research question generation
+        ### Alternative Free Options:
+        - **Google Colab**: Run the full model for free
+        - **Kaggle Notebooks**: 30 hours free GPU/week
+        - **Local with Jan App**: If you have 8GB+ VRAM
+        """)
+# Start the Gradio server only when this file is run as a script.
+if __name__ == "__main__":
+    # Listens on 0.0.0.0:7860 with no public share link.
+    # NOTE(review): 7860 is presumably the port the hosting platform expects - confirm.
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False
+    )

app.py CHANGED Viewed

@@ -1,37 +1,13 @@
 """
-Jan v1 Research Assistant for Hugging Face Spaces
-Optimized for research tasks and source analysis
 """
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import torch
 import requests
 from bs4 import BeautifulSoup
 import json
 from datetime import datetime
-from typing import List, Dict, Optional
-import hashlib
-# Initialize model
-print("🚀 Loading Jan v1 model...")
-model_name = "janhq/Jan-v1-4B"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    load_in_8bit=True  # Reduce memory usage
-)
-print("✅ Model loaded successfully!")
-# Cache for responses
-response_cache = {}
-def get_cache_key(query: str, context: str) -> str:
-    """Generate cache key for query+context"""
-    combined = f"{query}|{context}"
-    return hashlib.md5(combined.encode()).hexdigest()
 def scrape_url(url: str) -> str:
     """Scrape and extract text from URL"""
@@ -55,348 +31,152 @@ def scrape_url(url: str) -> str:
     except Exception as e:
         return f"Error scraping URL: {str(e)}"
-def research_assistant(
-    query: str,
-    context: str = "",
-    temperature: float = 0.6,
-    use_cache: bool = True,
-    research_mode: str = "comprehensive"
-) -> str:
     """
-    Main research assistant function
     """
-    # Check cache
-    cache_key = get_cache_key(query, context)
-    if use_cache and cache_key in response_cache:
-        return "📌 [Cached] " + response_cache[cache_key]
-    # Build prompt based on research mode
-    if research_mode == "comprehensive":
-        prompt = f"""You are an expert research analyst. Provide comprehensive analysis.
-Context/Sources:
-{context if context else "No specific context provided"}
-Research Query:
-{query}
-Provide your analysis with:
-1. Key Findings & Insights
-2. Supporting Evidence
-3. Critical Analysis
-4. Confidence Level
-5. Suggested Follow-up Questions
-6. Potential Limitations
-Analysis:"""
-    elif research_mode == "fact_extraction":
-        prompt = f"""Extract and verify factual information.
-Source Material:
-{context}
-Task: {query}
-Extract:
-- Factual claims with confidence scores (0-100%)
-- Key entities and relationships
-- Dates, numbers, and statistics
-- Contradictions or inconsistencies
-Facts:"""
-    elif research_mode == "source_comparison":
-        prompt = f"""Compare and contrast multiple sources.
-Sources:
-{context}
-Comparison Task: {query}
-Analyze:
-- Common themes
-- Contradictions
-- Unique perspectives
-- Reliability assessment
-- Synthesis
-Comparison:"""
-    else:  # quick_summary
-        prompt = f"""Provide a quick summary.
-Content: {context}
-Task: {query}
-Summary:"""
-    # Tokenize and generate
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=1024,
-            temperature=temperature,
-            top_p=0.95,
-            top_k=20,
-            do_sample=True,
-            pad_token_id=tokenizer.eos_token_id
-        )
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Remove the prompt from response
-    response = response.replace(prompt, "").strip()
-    # Cache the response
-    if use_cache:
-        response_cache[cache_key] = response
-    return response
-def process_multiple_sources(sources_text: str, query: str, temperature: float = 0.6) -> str:
-    """Process multiple sources (URLs or text)"""
-    sources = sources_text.strip().split('\n')
-    combined_context = ""
-    source_count = 0
-    for source in sources[:5]:  # Limit to 5 sources
-        source = source.strip()
-        if not source:
-            continue
-        source_count += 1
-        if source.startswith('http'):
-            content = scrape_url(source)
-            combined_context += f"\n\n--- Source {source_count} (URL: {source[:50]}...) ---\n{content[:800]}"
-        else:
-            combined_context += f"\n\n--- Source {source_count} (Text) ---\n{source[:800]}"
-    if not combined_context:
-        return "No valid sources provided"
-    return research_assistant(
-        query=query,
-        context=combined_context,
-        temperature=temperature,
-        research_mode="source_comparison"
-    )
-def extract_entities(text: str) -> str:
-    """Extract key entities from text"""
-    return research_assistant(
-        query="Extract all people, organizations, locations, dates, and key concepts",
-        context=text,
-        temperature=0.3,
-        research_mode="fact_extraction"
-    )
-def generate_research_questions(topic: str, context: str = "") -> str:
-    """Generate research questions for a topic"""
-    return research_assistant(
-        query=f"Generate 10 specific, actionable research questions about: {topic}",
-        context=context,
-        temperature=0.7,
-        research_mode="comprehensive"
-    )
 # Create Gradio interface
-with gr.Blocks(title="Jan v1 Research Assistant", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 🔬 Jan v1 Research Assistant
-    Powered by Jan-v1-4B (91.1% accuracy) - Optimized for research and analysis
-    ### Features:
-    - 🌐 Web scraping and analysis
-    - 📊 Multi-source comparison
-    - 🔍 Entity extraction
-    - ❓ Research question generation
-    - 💾 Response caching
     """)
-    with gr.Tab("Single Source Analysis"):
         with gr.Row():
             with gr.Column():
-                single_query = gr.Textbox(
                     label="Research Query",
                     placeholder="What would you like to research?",
                     lines=2
                 )
-                single_context = gr.Textbox(
                     label="Context (paste text or URL)",
                     placeholder="Paste article text or enter URL to analyze",
                     lines=5
                 )
-                single_mode = gr.Radio(
-                    ["comprehensive", "fact_extraction", "quick_summary"],
-                    label="Analysis Mode",
-                    value="comprehensive"
-                )
-                single_temp = gr.Slider(0.1, 1.0, value=0.6, label="Temperature")
-                single_cache = gr.Checkbox(label="Use cache", value=True)
-                single_btn = gr.Button("🔍 Analyze", variant="primary")
             with gr.Column():
-                single_output = gr.Textbox(
                     label="Analysis Results",
                     lines=15
                 )
-        def analyze_single(query, context, mode, temp, cache):
-            # Check if context is URL
-            if context.startswith('http'):
-                context = scrape_url(context)
-            return research_assistant(
-                query=query,
-                context=context,
-                temperature=temp,
-                use_cache=cache,
-                research_mode=mode
-            )
-        single_btn.click(
-            analyze_single,
-            inputs=[single_query, single_context, single_mode, single_temp, single_cache],
-            outputs=single_output
-        )
-    with gr.Tab("Multi-Source Comparison"):
-        with gr.Row():
-            with gr.Column():
-                multi_sources = gr.Textbox(
-                    label="Sources (one per line, URLs or text)",
-                    placeholder="https://example.com/article1\nhttps://example.com/article2\nOr paste text directly",
-                    lines=6
-                )
-                multi_query = gr.Textbox(
-                    label="Comparison Query",
-                    placeholder="What aspects should I compare?",
-                    lines=2
-                )
-                multi_temp = gr.Slider(0.1, 1.0, value=0.6, label="Temperature")
-                multi_btn = gr.Button("🔄 Compare Sources", variant="primary")
-            with gr.Column():
-                multi_output = gr.Textbox(
-                    label="Comparison Results",
-                    lines=15
-                )
-        multi_btn.click(
-            process_multiple_sources,
-            inputs=[multi_sources, multi_query, multi_temp],
-            outputs=multi_output
         )
-    with gr.Tab("Entity Extraction"):
         with gr.Row():
             with gr.Column():
-                entity_input = gr.Textbox(
-                    label="Text or URL",
-                    placeholder="Paste text or URL to extract entities from",
-                    lines=8
                 )
-                entity_btn = gr.Button("🏷️ Extract Entities", variant="primary")
             with gr.Column():
-                entity_output = gr.Textbox(
-                    label="Extracted Entities",
                     lines=10
                 )
-        def extract_entities_wrapper(text):
-            if text.startswith('http'):
-                text = scrape_url(text)
-            return extract_entities(text)
-        entity_btn.click(
-            extract_entities_wrapper,
-            inputs=entity_input,
-            outputs=entity_output
         )
-    with gr.Tab("Research Question Generator"):
-        with gr.Row():
-            with gr.Column():
-                rq_topic = gr.Textbox(
-                    label="Research Topic",
-                    placeholder="Enter your research topic",
-                    lines=2
-                )
-                rq_context = gr.Textbox(
-                    label="Additional Context (optional)",
-                    placeholder="Any specific focus areas or constraints",
-                    lines=4
-                )
-                rq_btn = gr.Button("💡 Generate Questions", variant="primary")
-            with gr.Column():
-                rq_output = gr.Textbox(
-                    label="Research Questions",
-                    lines=12
-                )
-        rq_btn.click(
-            generate_research_questions,
-            inputs=[rq_topic, rq_context],
-            outputs=rq_output
-        )
-    with gr.Tab("API Integration"):
         gr.Markdown("""
-        ### 🔌 Integrate with your Research App
-        Once deployed, you can call this Space via API:
-        ```javascript
-        // JavaScript/TypeScript
-        const response = await fetch('https://[your-username].hf.space/api/predict', {
-            method: 'POST',
-            headers: { 'Content-Type': 'application/json' },
-            body: JSON.stringify({
-                data: [
-                    "Your research query",
-                    "Context or URL",
-                    "comprehensive",  // mode
-                    0.6,  // temperature
-                    true  // use cache
-                ]
-            })
-        });
-        const result = await response.json();
-        ```
-        ```python
-        # Python
-        import requests
-        response = requests.post(
-            'https://[your-username].hf.space/api/predict',
-            json={
-                "data": [
-                    "Your research query",
-                    "Context or URL",
-                    "comprehensive",
-                    0.6,
-                    True
-                ]
-            }
-        )
-        result = response.json()
-        ```
         """)
-    gr.Markdown("""
-    ---
-    ### 💡 Tips:
-    - Lower temperature (0.1-0.3) for factual extraction
-    - Higher temperature (0.7-0.9) for creative research questions
-    - Cache is cleared when Space restarts
-    - URLs are automatically scraped and analyzed
-    """)
 if __name__ == "__main__":
     demo.launch(

 """
+Jan v1 Research Assistant - Simplified Version for CPU
+Works without GPU - returns a templated analysis (no model or API calls yet)
 """
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 import json
 from datetime import datetime
 def scrape_url(url: str) -> str:
     """Scrape and extract text from URL"""
     except Exception as e:
         return f"Error scraping URL: {str(e)}"
+def research_assistant_simple(query: str, context: str = "") -> str:
     """
+    Simplified research assistant that fills in a static analysis template (no model or API calls yet)
     """
+    # For now, return a structured analysis template
+    # This can be replaced with actual API calls to Jan v1 when available
+    if context.startswith('http'):
+        context = scrape_url(context)
+    analysis = f"""
+# Research Analysis
+## Query
+{query}
+## Context Summary
+{context[:500] if context else "No context provided"}...
+## Analysis Framework
+### 1. Key Findings
+- The context provides information about the topic
+- Further analysis would require examining specific aspects
+- Consider multiple perspectives on this subject
+### 2. Critical Questions
+- What are the primary assumptions?
+- What evidence supports the main claims?
+- What alternative viewpoints exist?
+### 3. Research Directions
+- Investigate primary sources
+- Compare with related studies
+- Examine historical context
+### 4. Limitations
+- Limited context provided
+- Single source analysis
+- Requires deeper investigation
+### 5. Next Steps
+- Gather additional sources
+- Conduct comparative analysis
+- Validate key claims
+---
+*Note: This is a simplified version. For full Jan v1 capabilities, GPU hardware is required.*
+"""
+    return analysis
 # Create Gradio interface
+with gr.Blocks(title="Jan v1 Research Assistant (Simplified)", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
+    # 🔬 Jan v1 Research Assistant (Simplified Version)
+    This is a CPU-compatible version with limited features.
+    For full Jan v1 (4B params) capabilities, GPU hardware is required.
+    ### Available Features:
+    - 🌐 Web scraping and text extraction
+    - 📝 Structured research framework
+    - 🔍 Context analysis
     """)
+    with gr.Tab("Research Analysis"):
         with gr.Row():
             with gr.Column():
+                query = gr.Textbox(
                     label="Research Query",
                     placeholder="What would you like to research?",
                     lines=2
                 )
+                context = gr.Textbox(
                     label="Context (paste text or URL)",
                     placeholder="Paste article text or enter URL to analyze",
                     lines=5
                 )
+                analyze_btn = gr.Button("🔍 Analyze", variant="primary")
             with gr.Column():
+                output = gr.Textbox(
                     label="Analysis Results",
                     lines=15
                 )
+        analyze_btn.click(
+            research_assistant_simple,
+            inputs=[query, context],
+            outputs=output
         )
+    with gr.Tab("Web Scraper"):
         with gr.Row():
             with gr.Column():
+                url_input = gr.Textbox(
+                    label="URL to Scrape",
+                    placeholder="https://example.com/article",
+                    lines=1
                 )
+                scrape_btn = gr.Button("🌐 Extract Text", variant="primary")
             with gr.Column():
+                scrape_output = gr.Textbox(
+                    label="Extracted Text",
                     lines=10
                 )
+        scrape_btn.click(
+            scrape_url,
+            inputs=url_input,
+            outputs=scrape_output
         )
+    with gr.Tab("Instructions"):
         gr.Markdown("""
+        ## 📋 How to Enable Full Jan v1
+        This Space is currently running in simplified mode without the actual Jan v1 model.
+        To enable full capabilities:
+        1. **Go to Settings**: https://huggingface.co/spaces/darwincb/jan-v1-research/settings
+        2. **Select Hardware**: GPU T4 medium ($0.60/hour)
+        3. **Save changes**
+        4. **Wait 5 minutes** for rebuild
+        ### Current Limitations (CPU mode):
+        - ❌ No actual Jan v1 model (4B params needs GPU)
+        - ❌ No AI-powered analysis
+        - ✅ Web scraping works
+        - ✅ Structured framework available
+        ### With GPU Enabled:
+        - ✅ Full Jan v1 model (91.1% accuracy)
+        - ✅ AI-powered research analysis
+        - ✅ Entity extraction
+        - ✅ Multi-source comparison
+        - ✅ Research question generation
+        ### Alternative Free Options:
+        - **Google Colab**: Run the full model for free
+        - **Kaggle Notebooks**: 30 hours free GPU/week
+        - **Local with Jan App**: If you have 8GB+ VRAM
         """)
 if __name__ == "__main__":
     demo.launch(

push-to-hf.sh ADDED Viewed

	@@ -0,0 +1,35 @@

+#!/bin/bash
+# Script para hacer push a Hugging Face
+# Necesitas tu token de Hugging Face
+echo "🚀 Pushing Jan v1 Research Assistant to Hugging Face..."
+echo ""
+echo "Necesitas tu token de Hugging Face."
+echo "Puedes obtenerlo en: https://huggingface.co/settings/tokens"
+echo ""
+read -p "Pega tu token de Hugging Face aquí: " HF_TOKEN
+if [ -z "$HF_TOKEN" ]; then
+    echo "❌ Token vacío. Abortando."
+    exit 1
+fi
+# Configurar la URL con el token
+git remote set-url origin https://darwincb:${HF_TOKEN}@huggingface.co/spaces/darwincb/jan-v1-research
+# Hacer push
+echo "📤 Subiendo archivos..."
+git push origin main
+if [ $? -eq 0 ]; then
+    echo "✅ ¡Éxito! Jan v1 Research Assistant subido a Hugging Face"
+    echo "🔗 Ve a: https://huggingface.co/spaces/darwincb/jan-v1-research"
+    echo ""
+    echo "⚠️ IMPORTANTE: Ve a Settings y selecciona 'GPU T4 medium' para que funcione"
+else
+    echo "❌ Error al hacer push. Verifica tu token."
+fi
+# Limpiar el token de la URL remota por seguridad
+git remote set-url origin https://huggingface.co/spaces/darwincb/jan-v1-research

requirements-simple.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+# Simplified requirements for CPU version
+gradio==4.19.2
+beautifulsoup4==4.12.3
+requests==2.31.0
+lxml==5.1.0

requirements.txt CHANGED Viewed

@@ -1,10 +1,5 @@
-# Jan v1 Research Assistant Requirements
-transformers==4.36.2
-torch==2.1.2
 gradio==4.19.2
-accelerate==0.25.0
-bitsandbytes==0.42.0
-sentencepiece==0.1.99
 beautifulsoup4==4.12.3
 requests==2.31.0
 lxml==5.1.0

+# Simplified requirements for CPU version
 gradio==4.19.2
 beautifulsoup4==4.12.3
 requests==2.31.0
 lxml==5.1.0