glitz-dev committed
Commit: 4466fc1
Parent(s): 1fd1e18
Add get_annotation with text and context parameters
Files changed:
- biomed_annotator.py +326 -0
- hipaathesis.py +47 -26
- requirements.txt +14 -12
- test_annotations_api.py +20 -0
biomed_annotator.py
ADDED
@@ -0,0 +1,326 @@
import json
import os
import re
import httpx
from typing import Optional, List, Literal, Any, Dict
from dotenv import load_dotenv
from pydantic import BaseModel
from tenacity import retry, stop_after_attempt, wait_exponential

load_dotenv()

# --- 1. Schemas ---

QuestionCategory = Literal["Clinical", "Mechanism", "Evidence", "Methods", "Limitations", "NextStep"]

class GeneratedQuestion(BaseModel):
    category: QuestionCategory
    question: str
    evidence_quote: str

# --- 2. Configuration ---

def get_hf_token_from_cache() -> str:
    """Get HuggingFace token from local cache (from huggingface-cli login)"""
    try:
        from huggingface_hub import HfFolder
        token = HfFolder.get_token()
        if token:
            print("[DEBUG] Found HuggingFace token from local cache")
            return token
    except ImportError:
        print("[DEBUG] huggingface_hub not installed, cannot get token from cache")
    except Exception as e:
        print(f"[DEBUG] Could not get HF token from cache: {e}")
    return ""

class Settings:
    def __init__(self):
        # LLM provider: 'ollama', 'openai_compat', or 'huggingface'
        self.llm_provider: str = os.getenv("LLM_PROVIDER", "huggingface")

        # Ollama settings
        self.ollama_base_url: str = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
        self.ollama_model: str = os.getenv("OLLAMA_MODEL", "llama3.2")  # e.g. qwen2.5:3b-instruct, llama3.2
        self.ollama_timeout_s: int = int(os.getenv("OLLAMA_TIMEOUT_S", 300))

        # OpenAI-compatible settings
        self.openai_compat_base_url: str = os.getenv("OPENAI_COMPAT_BASE_URL", "http://localhost:8080/v1")
        self.openai_compat_model: str = os.getenv("OPENAI_COMPAT_MODEL", "gpt-4o")
        self.openai_compat_api_key: str = os.getenv("OPENAI_COMPAT_API_KEY", "not-needed")
        self.openai_compat_timeout_s: int = int(os.getenv("OPENAI_COMPAT_TIMEOUT_S", 120))

        # HuggingFace serverless inference settings
        self.hf_model: str = os.getenv("HF_MODEL", "microsoft/Phi-3-mini-4k-instruct")
        # Try the env var first, then fall back to the local cache token
        self.hf_api_key: str = os.getenv("HF_API_KEY", "") or get_hf_token_from_cache()

        # Generation settings
        self.max_output_questions: int = int(os.getenv("MAX_OUTPUT_QUESTIONS", 6))

settings = Settings()

# --- 3. Prompts ---

SYSTEM_PROMPT = (
    "You are a biomedical paper reading assistant. "
    "Only use the provided text. Do not add external facts. "
    "Every question MUST include an evidence_quote copied verbatim from the provided text."
)

def build_question_prompt(selected_text: str, context_text: str | None, section_title: str | None, page_start: int | None, page_end: int | None) -> str:
    meta = []
    if section_title:
        meta.append(f"Section: {section_title}")
    if page_start is not None:
        meta.append(f"Pages: {page_start}-{page_end or page_start}")
    meta_block = "\n".join(meta) if meta else "Section: Unknown"

    context = (context_text or "").strip()
    if not context:
        context = selected_text.strip()

    max_q = settings.max_output_questions

    return f"""Task: Generate good questions from this paper excerpt.

Excerpt metadata:
{meta_block}

Highlighted text:
{selected_text.strip()}

Context (use this for grounding; do not go beyond it):
{context}

Output STRICT JSON with this schema:
{{
  "questions": [
    {{
      "category": "Clinical|Mechanism|Evidence|Methods|Limitations|NextStep",
      "question": "...",
      "evidence_quote": "..."
    }}
  ]
}}

Rules:
- Output {max_q} questions.
- Questions must be specific and actionable.
- evidence_quote MUST be a verbatim substring of the Context text.
"""

# --- 4. LLM Clients ---

class LLMError(RuntimeError):
    pass

class BaseLLM:
    def generate_json(self, system_prompt: str, user_prompt: str) -> str:
        raise NotImplementedError

class OllamaLLM(BaseLLM):
    def __init__(self, cfg: Settings):
        self.base_url = cfg.ollama_base_url.rstrip("/")
        self.model = cfg.ollama_model
        self.timeout = cfg.ollama_timeout_s

    @retry(stop=stop_after_attempt(2), wait=wait_exponential(multiplier=0.5, min=0.5, max=2))
    def generate_json(self, system_prompt: str, user_prompt: str) -> str:
        url = f"{self.base_url}/api/generate"
        payload = {
            "model": self.model,
            "prompt": user_prompt,
            "system": system_prompt,
            "format": "json",
            "stream": False,
            "options": {"temperature": 0.4, "top_p": 0.9, "num_predict": 700}
        }
        print(f"[DEBUG] Ollama request to {url} with model={self.model}")
        try:
            with httpx.Client(timeout=self.timeout) as client:
                r = client.post(url, json=payload)
                print(f"[DEBUG] Ollama response status: {r.status_code}")
                if r.status_code != 200:
                    print(f"[DEBUG] Ollama error response: {r.text}")
                r.raise_for_status()
                data = r.json()
                return data.get("response", "").strip()
        except httpx.TimeoutException as e:
            print(f"[DEBUG] Ollama timeout: {e}")
            raise LLMError(f"Ollama generate timed out after {self.timeout}s: {e}")
        except Exception as e:
            print(f"[DEBUG] Ollama exception type={type(e).__name__}: {e}")
            raise LLMError(f"Ollama generate failed: {e}")

class OpenAICompatLLM(BaseLLM):
    def __init__(self, cfg: Settings):
        self.base_url = cfg.openai_compat_base_url.rstrip("/")
        self.model = cfg.openai_compat_model
        self.api_key = cfg.openai_compat_api_key
        self.timeout = cfg.openai_compat_timeout_s

    @retry(stop=stop_after_attempt(2), wait=wait_exponential(multiplier=0.5, min=0.5, max=2))
    def generate_json(self, system_prompt: str, user_prompt: str) -> str:
        url = f"{self.base_url}/chat/completions"
        headers = {"Content-Type": "application/json"}
        if self.api_key and self.api_key != "not-needed":
            headers["Authorization"] = f"Bearer {self.api_key}"
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "temperature": 0.4,
            "top_p": 0.9,
            "max_tokens": 900,
            "response_format": {"type": "json_object"}
        }
        try:
            with httpx.Client(timeout=self.timeout) as client:
                r = client.post(url, headers=headers, json=payload)
                r.raise_for_status()
                data = r.json()
                return (data["choices"][0]["message"]["content"] or "").strip()
        except Exception as e:
            raise LLMError(f"OpenAI-compat generate failed: {e}")

class HuggingFaceLLM(BaseLLM):
    """HuggingFace LLM using router.huggingface.co (OpenAI-compatible API format)"""

    def __init__(self, cfg: Settings):
        self.model = cfg.hf_model
        self.api_key = cfg.hf_api_key
        self.timeout = 120

    @retry(stop=stop_after_attempt(2), wait=wait_exponential(multiplier=0.5, min=0.5, max=2))
    def generate_json(self, system_prompt: str, user_prompt: str) -> str:
        # HuggingFace router with OpenAI-compatible format (hosted on HuggingFace)
        url = "https://router.huggingface.co/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        print(f"[DEBUG] HuggingFace request to model: {self.model}")
        print(f"[DEBUG] API key present: {bool(self.api_key and self.api_key != 'your_huggingface_api_key_here')}")

        # OpenAI-compatible chat format (works with HuggingFace models)
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "max_tokens": 800,
            "temperature": 0.4
        }

        try:
            with httpx.Client(timeout=self.timeout) as client:
                r = client.post(url, headers=headers, json=payload)
                print(f"[DEBUG] HuggingFace response status: {r.status_code}")
                if r.status_code != 200:
                    print(f"[DEBUG] HuggingFace error response: {r.text}")
                r.raise_for_status()
                # OpenAI-compatible response format
                data = r.json()
                if "choices" in data and len(data["choices"]) > 0:
                    return data["choices"][0]["message"]["content"].strip()
                return ""
        except Exception as e:
            print(f"[DEBUG] HuggingFace exception: {type(e).__name__}: {e}")
            raise LLMError(f"HuggingFace generate failed: {e}")

def get_llm(cfg: Settings) -> BaseLLM:
    provider = (cfg.llm_provider or "").lower().strip()
    if provider == "ollama":
        return OllamaLLM(cfg)
    if provider == "openai_compat":
        return OpenAICompatLLM(cfg)
    if provider == "huggingface":
        return HuggingFaceLLM(cfg)
    raise ValueError(f"Unsupported LLM_PROVIDER: {provider}")


# --- 5. Generation Logic ---

_JSON_RE = re.compile(r"\{.*\}", re.DOTALL)

def _safe_extract_json(text: str) -> dict | None:
    if not text:
        return None
    text = text.strip()
    try:
        return json.loads(text)
    except Exception:
        pass
    m = _JSON_RE.search(text)
    if m:
        try:
            return json.loads(m.group(0))
        except Exception:
            return None
    return None

def generate_annotations(
    selected_text: str,
    context_text: str | None = None,
    section_title: str | None = None,
    page_start: int | None = None,
    page_end: int | None = None,
    config: Settings | None = None
) -> List[Dict[str, Any]]:
    """
    Main entrypoint: generate questions for the selected text using the LLM only.
    Returns an empty list if generation fails.
    """
    cfg = config or settings

    # 1. Setup
    llm = get_llm(cfg)
    user_prompt = build_question_prompt(selected_text, context_text, section_title, page_start, page_end)

    # 2. Generate
    questions = []
    try:
        raw = llm.generate_json(SYSTEM_PROMPT, user_prompt)
        parsed = _safe_extract_json(raw)

        if parsed and isinstance(parsed, dict) and isinstance(parsed.get("questions"), list):
            for q in parsed["questions"]:
                try:
                    # Validate using Pydantic
                    item = GeneratedQuestion(**q).model_dump()
                    questions.append(item)
                except Exception:
                    continue

        # Limit to the maximum number of questions
        questions = questions[:cfg.max_output_questions]

    except Exception as e:
        print(f"LLM generation failed: {e}")
        # In LLM-only mode there is no fallback; return an empty list to be safe.
        return []

    return questions


# --- 6. CLI Test ---
if __name__ == "__main__":
    # Example usage
    sample_text = "BRCA1 mutations significantly increase the risk of developing breast cancer."
    sample_context = "In this study of 500 patients, we observed that BRCA1 mutations significantly increase the risk of developing breast cancer compared to controls."

    print("Generating annotations...")
    results = generate_annotations(
        selected_text=sample_text,
        context_text=sample_context,
        section_title="Abstract"
    )
    print(json.dumps(results, indent=2))
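Provider selection in this new module is driven entirely by environment variables read by the Settings class. A minimal .env sketch (the keys are the ones Settings reads; the values are illustrative, not part of this commit):

# .env -- illustrative values only
LLM_PROVIDER=huggingface
HF_MODEL=microsoft/Phi-3-mini-4k-instruct
HF_API_KEY=hf_xxx            # illustrative; omit to fall back to the huggingface-cli login cache
MAX_OUTPUT_QUESTIONS=6

# Or, for a local Ollama setup:
# LLM_PROVIDER=ollama
# OLLAMA_BASE_URL=http://localhost:11434
# OLLAMA_MODEL=llama3.2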
hipaathesis.py
CHANGED
@@ -122,7 +122,7 @@ except ImportError:
     OPENCV_AVAILABLE = False
 
 from questions import THESIS_QUESTIONS
-from
+from biomed_annotator import generate_annotations
 
 warnings.filterwarnings('ignore')
 
@@ -298,12 +298,13 @@ class SecureFileHandler:
 class HIPAACompliantThesisAnalyzer:
     """HIPAA-compliant version of the thesis analyzer"""
 
-    def __init__(self, user_id=None, password=None, session_timeout=30, model_name="t5-small"):
+    def __init__(self, user_id=None, password=None, session_timeout=30, model_name="t5-small", mode="analyze"):
         self.user_id = user_id or getpass.getuser()
         self.session_timeout = session_timeout  # minutes
         self.session_start = datetime.now()
         self.last_activity = datetime.now()
         self.model_name = model_name
+        self.mode = mode
 
         # Map model names to their optimal tasks and parameters
         self.model_configs = {
@@ -332,6 +333,7 @@ class HIPAACompliantThesisAnalyzer:
         print(f"HIPAA-Compliant Thesis Analyzer initialized for user: {self.user_id}")
         print(f"Session timeout: {session_timeout} minutes")
         print(f"Encryption enabled: {'Yes' if password else 'No'}")
+        print(f"Mode: {self.mode}")
 
     def _initialize_analyzer(self):
         """Initialize the core analyzer components"""
@@ -1195,6 +1197,18 @@ class HIPAACompliantThesisAnalyzer:
 
         return answers
 
+    def get_annotation(self, sample_text, sample_context):
+        """Generate annotations using biomed_annotator"""
+        try:
+            return generate_annotations(
+                selected_text=sample_text,
+                context_text=sample_context
+            )
+        except Exception as e:
+            print(f"Error in get_annotation: {e}")
+            self.hipaa_logger.log_access(self.user_id, "ANNOTATION_ERROR", "TEXT_SELECTION", success=False)
+            return []
+
     def cleanup_session(self):
         """Clean up session data securely"""
         self.hipaa_logger.log_access(self.user_id, "SESSION_END", "THESIS_ANALYZER")
@@ -1270,28 +1284,6 @@ def get_answer(req: AnalyzeReq):
         print(f"Error in get_answer: {e}")
         return {"error": str(e)}
 
-@app.post('/get_annotations')
-def get_annotations(req: AnalyzeReq):
-    """Get PubTator annotations only"""
-    try:
-        analyzer = HIPAACompliantThesisAnalyzer(
-            user_id=req.userId,
-            password=req.password,
-            session_timeout=30,
-            model_name=req.model_name
-        )
-
-        report = analyzer.process_annotations_only(
-            pdf_path=req.storageKey,
-            output_file="hipaa_annotations_only"
-        )
-
-        analyzer.cleanup_session()
-        return report
-    except Exception as e:
-        print(f"Error in get_annotations: {e}")
-        return {"error": str(e)}
-
 @app.post('/upload_db')
 async def upload_db(upload_db: str = Form(...), pdf_file: UploadFile = File(...)):
     """Read PDF, extract text & images + OCR, and save content to database"""
@@ -1583,7 +1575,6 @@ def download_pdf_from_url(document_url: str, verify_ssl: Optional[bool] = None)
 
     return response.content
 
-
 @app.post('/extract_content')
 async def extract_content(req: ExtractFromUrlRequest):
     """
@@ -1638,7 +1629,8 @@ def analyze(req: AnalyzeReq):
             user_id=req.userId,
             password=req.password,
             session_timeout=30,
-            model_name=req.model_name
+            model_name=req.model_name,
+            mode="analyze"
        )
 
         pdf_path = req.storageKey
@@ -1671,6 +1663,35 @@ def analyze(req: AnalyzeReq):
         print(f"Error: {e}")
         print("Ensure all requirements are installed and Tesseract is available.")
 
+
+class AnnotationReq(BaseModel):
+    userId: Optional[str] = None
+    password: Optional[str] = None
+    sample_text: str
+    sample_context: Optional[str] = None
+
+@app.post('/get_annotations')
+def get_annotations_api(req: AnnotationReq):
+    """Get annotations for selected text"""
+    try:
+        analyzer = HIPAACompliantThesisAnalyzer(
+            user_id=req.userId,
+            password=req.password,
+            mode="annotations"
+        )
+
+        annotations = analyzer.get_annotation(
+            sample_text=req.sample_text,
+            sample_context=req.sample_context
+        )
+
+        analyzer.cleanup_session()
+        return annotations
+
+    except Exception as e:
+        print(f"Error in get_annotations: {e}")
+        return {"error": str(e)}
+
 #if __name__ == "__main__":
 print("""
 HIPAA-COMPLIANT THESIS ANALYZER
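With this change, /get_annotations now accepts an AnnotationReq body instead of AnalyzeReq and returns the list produced by generate_annotations. An illustrative success response, shaped by the GeneratedQuestion schema (the content below is made up, using the sample from biomed_annotator's __main__ block):

[
  {
    "category": "Methods",
    "question": "How were the 500 patients in the study selected and compared to controls?",
    "evidence_quote": "In this study of 500 patients"
  }
]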
requirements.txt
CHANGED
@@ -1,21 +1,23 @@
 cryptography==46.0.1
-fastapi==0.118.0
+fastapi==0.118.0
 PyMuPDF==1.22.5
 nltk==3.9.1
-numpy<2.3.0,>=2
+numpy<2.3.0,>=2
 opencv_python==4.12.0.88
-Pillow==11.3.0
+Pillow==11.3.0
 pydantic==2.11.9
 PyPDF2==3.0.1
-pytesseract==0.3.13
+pytesseract==0.3.13
 requests==2.31.0
-torch==2.8.0
-transformers==4.56.1
+torch==2.8.0
+transformers==4.56.1
 urllib3==2.2.0
-uvicorn
-scikit-learn==1.4.2
-rank-bm25==0.2.2
-sentence-transformers==2.7.0
+uvicorn
+scikit-learn==1.4.2
+rank-bm25==0.2.2
+sentence-transformers==2.7.0
 pymupdf==1.24.9
-textstat==0.7.4
-psycopg2-binary==2.9.10
+textstat==0.7.4
+psycopg2-binary==2.9.10
+httpx
+tenacity
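The two new entries, httpx and tenacity, back the LLM clients in biomed_annotator.py. A self-contained sketch of the same retry pattern those clients use (illustrative, not part of this commit):

import httpx
from tenacity import retry, stop_after_attempt, wait_exponential

# Same policy as the LLM clients: at most 2 attempts,
# with exponential backoff capped between 0.5s and 2s.
@retry(stop=stop_after_attempt(2), wait=wait_exponential(multiplier=0.5, min=0.5, max=2))
def fetch(url: str) -> str:
    with httpx.Client(timeout=10) as client:
        r = client.get(url)
        r.raise_for_status()  # a raised error triggers one tenacity retry
        return r.text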
test_annotations_api.py
ADDED
@@ -0,0 +1,20 @@
import requests
import json

url = "http://localhost:8000/get_annotations"
payload = {
    "sample_text": "cancer",
    "sample_context": "Cancer is common nowadays; it is better to diagnose it in early stages. Recovery will be faster."
}

try:
    print(f"Sending request to {url}...")
    response = requests.post(url, json=payload, timeout=120)
    print(f"Status Code: {response.status_code}")
    print("Response Content:")
    try:
        print(json.dumps(response.json(), indent=2))
    except ValueError:
        # Body was not JSON; print it raw
        print(response.text)
except Exception as e:
    print(f"Error: {e}")
|