Spaces:

darwincb
/

jan-v1-research

Paused

App Files Files Community

darwincb committed on Aug 21

Commit

4b392a8

1 Parent(s): ab4eb68

Add Jan v1 Research Assistant with web scraping, multi-source analysis, and entity extraction

Browse files

Files changed (2) hide show

app.py +406 -0
requirements.txt +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,406 @@

+"""
+Jan v1 Research Assistant for Hugging Face Spaces
+Optimized for research tasks and source analysis
+"""
+import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+import requests
+from bs4 import BeautifulSoup
+import json
+from datetime import datetime
+from typing import List, Dict, Optional
+import hashlib
+# Initialize model
+# The tokenizer and model are loaded at import time, so importing this module
+# triggers the load (and the progress prints below).
+print("🚀 Loading Jan v1 model...")
+model_name = "janhq/Jan-v1-4B"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+# NOTE(review): load_in_8bit presumably relies on the bitsandbytes pin in
+# requirements.txt - confirm the Space hardware supports 8-bit loading and
+# that combining it with torch_dtype=torch.bfloat16 is intended.
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    load_in_8bit=True  # Reduce memory usage
+)
+print("✅ Model loaded successfully!")
+# Cache for responses
+# In-memory dict keyed by get_cache_key(); research_assistant() is the only
+# writer visible in this file and nothing here evicts entries, so it grows
+# for the lifetime of the process.
+response_cache = {}
+def get_cache_key(query: str, context: str) -> str:
+    """Return a hex digest identifying a (query, context) pair.
+
+    The query length is embedded ahead of the text so the boundary between
+    the two fields is unambiguous. Plain ``query|context`` concatenation let
+    ("a|b", "c") and ("a", "b|c") collide on the same key.
+
+    Args:
+        query: The research query text.
+        context: The source material paired with the query.
+
+    Returns:
+        A 32-character hexadecimal MD5 digest. It is used only as a cache
+        key, not as a security hash.
+    """
+    combined = f"{len(query)}:{query}|{context}"
+    return hashlib.md5(combined.encode("utf-8")).hexdigest()
+def scrape_url(url: str) -> str:
+    """Fetch a web page and return its visible text, capped at 4000 chars.
+
+    Script and style elements are removed, every line is stripped, runs
+    separated by double spaces are split apart, and the remaining pieces are
+    joined with single spaces.
+
+    Args:
+        url: Address of the page to fetch. Surrounding whitespace is ignored.
+
+    Returns:
+        The extracted text, or a string starting with "Error scraping URL:"
+        when the request or parsing fails. Callers display the result
+        directly, so failures are reported as text rather than raised.
+    """
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        }
+        response = requests.get(url.strip(), headers=headers, timeout=10)
+        # Treat 4xx/5xx answers as failures; otherwise the body of an error
+        # page would be handed to the model as if it were the source.
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, 'html.parser')
+        # Remove script and style elements
+        for tag in soup(["script", "style"]):
+            tag.decompose()
+        text = soup.get_text()
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        text = ' '.join(chunk for chunk in chunks if chunk)
+        return text[:4000]  # Limit to 4000 chars
+    except Exception as e:
+        # UI boundary: keep the best-effort contract of always returning text.
+        return f"Error scraping URL: {str(e)}"
+def research_assistant(
+    query: str,
+    context: str = "",
+    temperature: float = 0.6,
+    use_cache: bool = True,
+    research_mode: str = "comprehensive"
+) -> str:
+    """
+    Main research assistant function
+
+    Builds a prompt for the requested mode, samples a completion from the
+    module-level model and returns only the newly generated text.
+
+    Args:
+        query: The question or task to perform.
+        context: Source material inserted into the prompt.
+        temperature: Sampling temperature passed to ``model.generate``.
+        use_cache: When True, read from and write to ``response_cache``.
+        research_mode: "comprehensive", "fact_extraction" or
+            "source_comparison"; any other value uses the quick-summary
+            prompt.
+
+    Returns:
+        The generated text. Cache hits are prefixed with "📌 [Cached] ".
+    """
+    # Check cache
+    # The mode is folded into the key: the same query/context produces a
+    # different prompt per mode, so answers must not be shared across modes.
+    cache_key = get_cache_key(f"{research_mode}\n{query}", context)
+    if use_cache and cache_key in response_cache:
+        return "📌 [Cached] " + response_cache[cache_key]
+    # Build prompt based on research mode
+    if research_mode == "comprehensive":
+        prompt = f"""You are an expert research analyst. Provide comprehensive analysis.
+Context/Sources:
+{context if context else "No specific context provided"}
+Research Query:
+{query}
+Provide your analysis with:
+1. Key Findings & Insights
+2. Supporting Evidence
+3. Critical Analysis
+4. Confidence Level
+5. Suggested Follow-up Questions
+6. Potential Limitations
+Analysis:"""
+    elif research_mode == "fact_extraction":
+        prompt = f"""Extract and verify factual information.
+Source Material:
+{context}
+Task: {query}
+Extract:
+- Factual claims with confidence scores (0-100%)
+- Key entities and relationships
+- Dates, numbers, and statistics
+- Contradictions or inconsistencies
+Facts:"""
+    elif research_mode == "source_comparison":
+        prompt = f"""Compare and contrast multiple sources.
+Sources:
+{context}
+Comparison Task: {query}
+Analyze:
+- Common themes
+- Contradictions
+- Unique perspectives
+- Reliability assessment
+- Synthesis
+Comparison:"""
+    else:  # quick_summary
+        prompt = f"""Provide a quick summary.
+Content: {context}
+Task: {query}
+Summary:"""
+    # Tokenize and generate
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
+    # The model is loaded with device_map="auto", so it may not live on the
+    # CPU; the input tensors have to be moved to wherever it was placed.
+    inputs = inputs.to(model.device)
+    prompt_length = inputs["input_ids"].shape[-1]
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=1024,
+            temperature=temperature,
+            top_p=0.95,
+            top_k=20,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id
+        )
+    # Decode only the tokens produced after the prompt. Removing the prompt
+    # with str.replace fails whenever the prompt was truncated to max_length
+    # or does not survive a tokenize/decode round trip unchanged.
+    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
+    # Cache the response
+    if use_cache:
+        response_cache[cache_key] = response
+    return response
+def process_multiple_sources(sources_text: str, query: str, temperature: float = 0.6) -> str:
+    """Compare up to five sources given one per line (URLs or text).
+
+    Lines starting with "http" are fetched with ``scrape_url``; other lines
+    are used as literal text. Each source contributes at most 800 characters
+    to the combined context, which is then analysed in "source_comparison"
+    mode.
+
+    Args:
+        sources_text: Newline-separated sources.
+        query: What to compare across the sources.
+        temperature: Sampling temperature forwarded to the model.
+
+    Returns:
+        The comparison text, or "No valid sources provided" when every line
+        is blank.
+    """
+    # Drop blank lines before applying the limit so that empty lines between
+    # entries do not use up any of the five slots.
+    sources = [line.strip() for line in (sources_text or "").split('\n') if line.strip()]
+    sections = []
+    for source_count, source in enumerate(sources[:5], start=1):  # Limit to 5 sources
+        if source.startswith('http'):
+            content = scrape_url(source)
+            sections.append(f"\n\n--- Source {source_count} (URL: {source[:50]}...) ---\n{content[:800]}")
+        else:
+            sections.append(f"\n\n--- Source {source_count} (Text) ---\n{source[:800]}")
+    if not sections:
+        return "No valid sources provided"
+    return research_assistant(
+        query=query,
+        context="".join(sections),
+        temperature=temperature,
+        research_mode="source_comparison"
+    )
+def extract_entities(text: str) -> str:
+    """Ask the model to list the entities found in ``text``.
+
+    Delegates to ``research_assistant`` in "fact_extraction" mode with a
+    fixed instruction and a temperature of 0.3.
+    """
+    entity_task = "Extract all people, organizations, locations, dates, and key concepts"
+    return research_assistant(entity_task, text, 0.3, research_mode="fact_extraction")
+def generate_research_questions(topic: str, context: str = "") -> str:
+    """Ask the model for ten research questions about ``topic``.
+
+    Delegates to ``research_assistant`` in "comprehensive" mode with a
+    temperature of 0.7; ``context`` is passed through unchanged.
+    """
+    question_task = f"Generate 10 specific, actionable research questions about: {topic}"
+    return research_assistant(question_task, context, 0.7, research_mode="comprehensive")
+# Create Gradio interface
+# `demo` is the Blocks app launched by the __main__ guard at the end of the
+# file; each feature below gets its own tab inside this context.
+with gr.Blocks(title="Jan v1 Research Assistant", theme=gr.themes.Soft()) as demo:
+    # Introductory banner rendered above the tabs.
+    gr.Markdown("""
+    # 🔬 Jan v1 Research Assistant
+    Powered by Jan-v1-4B (91.1% accuracy) - Optimized for research and analysis
+    ### Features:
+    - 🌐 Web scraping and analysis
+    - 📊 Multi-source comparison
+    - 🔍 Entity extraction
+    - ❓ Research question generation
+    - 💾 Response caching
+    """)
+    # Tab 1: analyse one query against one pasted text or one URL.
+    with gr.Tab("Single Source Analysis"):
+        with gr.Row():
+            with gr.Column():
+                single_query = gr.Textbox(
+                    label="Research Query",
+                    placeholder="What would you like to research?",
+                    lines=2
+                )
+                single_context = gr.Textbox(
+                    label="Context (paste text or URL)",
+                    placeholder="Paste article text or enter URL to analyze",
+                    lines=5
+                )
+                single_mode = gr.Radio(
+                    ["comprehensive", "fact_extraction", "quick_summary"],
+                    label="Analysis Mode",
+                    value="comprehensive"
+                )
+                single_temp = gr.Slider(0.1, 1.0, value=0.6, label="Temperature")
+                single_cache = gr.Checkbox(label="Use cache", value=True)
+                single_btn = gr.Button("🔍 Analyze", variant="primary")
+            with gr.Column():
+                single_output = gr.Textbox(
+                    label="Analysis Results",
+                    lines=15
+                )
+        def analyze_single(query, context, mode, temp, cache):
+            """Run research_assistant on the form values, scraping a URL first.
+
+            The context is treated as a URL only when, after trimming, it is
+            a single http:// or https:// token. A URL pasted with leading
+            whitespace is therefore still fetched, while ordinary text that
+            merely begins with "http" is analysed as written.
+            """
+            context = (context or "").strip()
+            # Check if context is URL
+            if context.startswith(('http://', 'https://')) and len(context.split()) == 1:
+                context = scrape_url(context)
+            return research_assistant(
+                query=query,
+                context=context,
+                temperature=temp,
+                use_cache=cache,
+                research_mode=mode
+            )
+        single_btn.click(
+            analyze_single,
+            inputs=[single_query, single_context, single_mode, single_temp, single_cache],
+            outputs=single_output
+        )
+    # Tab 2: compare several sources entered one per line. The button calls
+    # process_multiple_sources directly, which caps the input at five sources.
+    with gr.Tab("Multi-Source Comparison"):
+        with gr.Row():
+            with gr.Column():
+                multi_sources = gr.Textbox(
+                    label="Sources (one per line, URLs or text)",
+                    placeholder="https://example.com/article1\nhttps://example.com/article2\nOr paste text directly",
+                    lines=6
+                )
+                multi_query = gr.Textbox(
+                    label="Comparison Query",
+                    placeholder="What aspects should I compare?",
+                    lines=2
+                )
+                multi_temp = gr.Slider(0.1, 1.0, value=0.6, label="Temperature")
+                multi_btn = gr.Button("🔄 Compare Sources", variant="primary")
+            with gr.Column():
+                multi_output = gr.Textbox(
+                    label="Comparison Results",
+                    lines=15
+                )
+        # Input order matches the (sources_text, query, temperature) signature.
+        multi_btn.click(
+            process_multiple_sources,
+            inputs=[multi_sources, multi_query, multi_temp],
+            outputs=multi_output
+        )
+    # Tab 3: entity extraction from pasted text or a single URL.
+    with gr.Tab("Entity Extraction"):
+        with gr.Row():
+            with gr.Column():
+                entity_input = gr.Textbox(
+                    label="Text or URL",
+                    placeholder="Paste text or URL to extract entities from",
+                    lines=8
+                )
+                entity_btn = gr.Button("🏷️ Extract Entities", variant="primary")
+            with gr.Column():
+                entity_output = gr.Textbox(
+                    label="Extracted Entities",
+                    lines=10
+                )
+        def extract_entities_wrapper(text):
+            """Extract entities from the input, scraping it first if it is a URL.
+
+            The input counts as a URL only when, after trimming, it is a
+            single http:// or https:// token; anything else is passed to
+            extract_entities as plain text.
+            """
+            text = (text or "").strip()
+            if text.startswith(('http://', 'https://')) and len(text.split()) == 1:
+                text = scrape_url(text)
+            return extract_entities(text)
+        entity_btn.click(
+            extract_entities_wrapper,
+            inputs=entity_input,
+            outputs=entity_output
+        )
+    # Tab 4: generate research questions for a topic. The button calls
+    # generate_research_questions directly.
+    with gr.Tab("Research Question Generator"):
+        with gr.Row():
+            with gr.Column():
+                rq_topic = gr.Textbox(
+                    label="Research Topic",
+                    placeholder="Enter your research topic",
+                    lines=2
+                )
+                rq_context = gr.Textbox(
+                    label="Additional Context (optional)",
+                    placeholder="Any specific focus areas or constraints",
+                    lines=4
+                )
+                rq_btn = gr.Button("💡 Generate Questions", variant="primary")
+            with gr.Column():
+                rq_output = gr.Textbox(
+                    label="Research Questions",
+                    lines=12
+                )
+        # Input order matches the (topic, context) signature.
+        rq_btn.click(
+            generate_research_questions,
+            inputs=[rq_topic, rq_context],
+            outputs=rq_output
+        )
+    # Tab 5: static documentation only; it renders example client code and
+    # registers no event handlers.
+    # NOTE(review): both examples POST to /api/predict with the five inputs of
+    # the "Analyze" button - confirm that this route and payload match what the
+    # pinned Gradio version exposes for an app with several click handlers.
+    with gr.Tab("API Integration"):
+        gr.Markdown("""
+        ### 🔌 Integrate with your Research App
+        Once deployed, you can call this Space via API:
+        ```javascript
+        // JavaScript/TypeScript
+        const response = await fetch('https://[your-username].hf.space/api/predict', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({
+                data: [
+                    "Your research query",
+                    "Context or URL",
+                    "comprehensive",  // mode
+                    0.6,  // temperature
+                    true  // use cache
+                ]
+            })
+        });
+        const result = await response.json();
+        ```
+        ```python
+        # Python
+        import requests
+        response = requests.post(
+            'https://[your-username].hf.space/api/predict',
+            json={
+                "data": [
+                    "Your research query",
+                    "Context or URL",
+                    "comprehensive",
+                    0.6,
+                    True
+                ]
+            }
+        )
+        result = response.json()
+        ```
+        """)
+    # Footer shown below the tabs. The cache it mentions is the in-memory
+    # response_cache dict, which is why it does not survive a restart.
+    gr.Markdown("""
+    ---
+    ### 💡 Tips:
+    - Lower temperature (0.1-0.3) for factual extraction
+    - Higher temperature (0.7-0.9) for creative research questions
+    - Cache is cleared when Space restarts
+    - URLs are automatically scraped and analyzed
+    """)
+if __name__ == "__main__":
+    # Listen on all interfaces at port 7860 and do not create a share link.
+    launch_options = {
+        "server_name": "0.0.0.0",
+        "server_port": 7860,
+        "share": False,
+    }
+    demo.launch(**launch_options)

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+# Jan v1 Research Assistant Requirements
+# NOTE(review): confirm that this transformers release can load the
+# "janhq/Jan-v1-4B" architecture that app.py requests.
+transformers==4.36.2
+torch==2.1.2
+gradio==4.19.2
+# accelerate / bitsandbytes presumably back device_map="auto" and
+# load_in_8bit=True in app.py - verify against the Space hardware.
+accelerate==0.25.0
+bitsandbytes==0.42.0
+sentencepiece==0.1.99
+beautifulsoup4==4.12.3
+requests==2.31.0
+# NOTE(review): app.py parses with 'html.parser'; lxml looks unused in the
+# visible code.
+lxml==5.1.0