Spaces:

Sameer-Handsome173
/

graph_rag

Sleeping

App Files Files Community

Sameer-Handsome173 committed on Dec 15, 2025

Commit

7641778

verified ·

1 Parent(s): 4c32a55

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -73

app.py CHANGED Viewed

@@ -4,16 +4,17 @@
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
 from langchain.chains import GraphCypherQAChain, LLMChain
 from langchain_community.graphs import Neo4jGraph
 from langchain_community.llms import HuggingFaceHub
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.prompts import PromptTemplate, ChatPromptTemplate
 from langchain.output_parsers import PydanticOutputParser
-from typing import List
 import os
 import json
 import uvicorn
 # ================================
@@ -47,6 +48,28 @@ llm = None
 qa_chain = None
 extraction_chain = None
 # ================================
 # Pydantic Models for API
 # ================================
@@ -65,27 +88,31 @@ class QueryResponse(BaseModel):
     cypher_query: str = None
 # ================================
-# Prompt Templates
 # ================================
-# 1. Entity Extraction Prompt Template (Simplified for Qwen)
-ENTITY_EXTRACTION_TEMPLATE = """Extract entities and relationships from the text below.
 TEXT:
 {text}
-Extract:
-1. ENTITIES: People, organizations, products, technologies, concepts
-2. RELATIONSHIPS: How entities connect (CREATED, WORKS_AT, USES, etc.)
-Output ONLY this JSON format (no other text):
-{{"entities": [{{"name": "FastAPI", "type": "Technology", "description": "web framework"}}], "relationships": [{{"source": "Person", "target": "FastAPI", "type": "CREATED"}}]}}
-JSON:"""
 entity_extraction_prompt = PromptTemplate(
     input_variables=["text"],
-    template=ENTITY_EXTRACTION_TEMPLATE
 )
 # 2. Cypher Generation Prompt Template
@@ -203,82 +230,107 @@ async def startup_event():
 # ================================
 def extract_entities_relationships(text_chunk):
-    """Extract entities and relationships using LangChain prompt template"""
     try:
-        # Use the extraction chain
-        response = extraction_chain.run(text=text_chunk)
         print(f"\n{'='*60}")
-        print("RAW LLM RESPONSE:")
-        print(response)
-        print('='*60)
-        # Clean response
-        response = response.strip()
-        # Remove markdown code blocks if present
-        if "```json" in response:
-            response = response.split("```json")[1].split("```")[0]
-        elif "```" in response:
-            response = response.split("```")[1].split("```")[0]
-        response = response.strip()
-        # Find JSON object
-        if "{" in response and "}" in response:
-            start = response.find("{")
-            end = response.rfind("}") + 1
-            response = response[start:end]
-        print(f"CLEANED JSON:")
-        print(response)
         print('='*60)
-        data = json.loads(response)
-        print(f"PARSED DATA:")
-        print(f"Entities: {len(data.get('entities', []))}")
-        print(f"Relationships: {len(data.get('relationships', []))}")
-        return data
-    except json.JSONDecodeError as e:
-        print(f"❌ JSON parsing error: {e}")
-        print(f"Response was: {response[:500]}")
-        # Fallback: Try to extract at least some basic entities
-        return fallback_extraction(text_chunk)
     except Exception as e:
-        print(f"❌ Extraction error: {e}")
-        return {"entities": [], "relationships": []}
 def fallback_extraction(text):
-    """Simple fallback extraction using basic NLP"""
     print("⚠️ Using fallback extraction...")
-    # Simple entity extraction - find capitalized words
-    import re
-    words = text.split()
     entities = []
-    seen = set()
-    for i, word in enumerate(words):
-        # Find capitalized words (potential entities)
-        if word[0].isupper() and len(word) > 2:
-            clean_word = re.sub(r'[^\w\s]', '', word)
-            if clean_word and clean_word not in seen:
-                entities.append({
-                    "name": clean_word,
-                    "type": "Concept",
-                    "description": f"Extracted from: {' '.join(words[max(0,i-3):min(len(words),i+4)])}"
                 })
-                seen.add(clean_word)
-    print(f"Fallback extracted {len(entities)} entities")
-    return {"entities": entities[:20], "relationships": []}
 def add_to_graph(entities, relationships, doc_name):
     """Add entities and relationships to Neo4j with proper sanitization"""

 from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
 from langchain.chains import GraphCypherQAChain, LLMChain
 from langchain_community.graphs import Neo4jGraph
 from langchain_community.llms import HuggingFaceHub
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.prompts import PromptTemplate
 from langchain.output_parsers import PydanticOutputParser
+from typing import List, Optional
 import os
 import json
+import re
 import uvicorn
 # ================================
 qa_chain = None
 extraction_chain = None
+# ================================
+# Pydantic Models for Extraction
+# ================================
+class Entity(BaseModel):
+    """Single entity extracted from text"""
+    name: str = Field(description="The name of the entity")
+    type: str = Field(description="Type: Person, Organization, Product, Technology, Concept, Location")
+    description: Optional[str] = Field(default="", description="Brief description of the entity")
+class Relationship(BaseModel):
+    """Relationship between two entities"""
+    source: str = Field(description="Source entity name")
+    target: str = Field(description="Target entity name")
+    type: str = Field(description="Relationship type in UPPER_SNAKE_CASE (e.g., CREATED, FOUNDED, USES)")
+    context: Optional[str] = Field(default="", description="Context of the relationship")
+class ExtractionResult(BaseModel):
+    """Complete extraction result"""
+    entities: List[Entity] = Field(description="List of extracted entities")
+    relationships: List[Relationship] = Field(description="List of extracted relationships")
 # ================================
 # Pydantic Models for API
 # ================================
     cypher_query: str = None
 # ================================
+# Prompt Templates with Pydantic Parser
 # ================================
+# Create parser for structured output
+extraction_parser = PydanticOutputParser(pydantic_object=ExtractionResult)
+# 1. Entity Extraction Prompt with Pydantic
+ENTITY_EXTRACTION_TEMPLATE = """Extract entities and relationships from the text.
+{format_instructions}
 TEXT:
 {text}
+Important:
+- Extract people, organizations, products, technologies, concepts
+- For relationships use: CREATED, FOUNDED, USES, BUILT_ON, WORKS_AT, CEO_OF, INTEGRATES_WITH
+- Be specific and accurate
+Your response:"""
 entity_extraction_prompt = PromptTemplate(
     input_variables=["text"],
+    template=ENTITY_EXTRACTION_TEMPLATE,
+    partial_variables={"format_instructions": extraction_parser.get_format_instructions()}
 )
 # 2. Cypher Generation Prompt Template
 # ================================
 def extract_entities_relationships(text_chunk):
+    """Extract entities and relationships using Pydantic structured output"""
     try:
         print(f"\n{'='*60}")
+        print(f"Processing chunk: {text_chunk[:100]}...")
+        # Use the extraction chain
+        response = extraction_chain.run(text=text_chunk)
+        print(f"RAW LLM RESPONSE:")
+        print(response[:500])
         print('='*60)
+        # Try to parse with Pydantic parser
+        try:
+            result = extraction_parser.parse(response)
+            entities = [e.dict() for e in result.entities]
+            relationships = [r.dict() for r in result.relationships]
+            print(f"✅ PARSED with Pydantic:")
+            print(f"   Entities: {len(entities)}")
+            print(f"   Relationships: {len(relationships)}")
+            return {"entities": entities, "relationships": relationships}
+        except Exception as parse_error:
+            print(f"⚠️ Pydantic parsing failed: {parse_error}")
+            print("Trying manual JSON extraction...")
+            # Fallback: Try manual JSON extraction
+            cleaned = response.strip()
+            # Remove markdown
+            if "```json" in cleaned:
+                cleaned = cleaned.split("```json")[1].split("```")[0]
+            elif "```" in cleaned:
+                cleaned = cleaned.split("```")[1].split("```")[0]
+            # Find JSON
+            if "{" in cleaned and "}" in cleaned:
+                start = cleaned.find("{")
+                end = cleaned.rfind("}") + 1
+                cleaned = cleaned[start:end]
+            data = json.loads(cleaned)
+            print(f"✅ Manual JSON parse successful: {len(data.get('entities', []))} entities")
+            return data
     except Exception as e:
+        print(f"❌ All parsing failed: {e}")
+        print("Using fallback extraction...")
+        return fallback_extraction(text_chunk)
 def fallback_extraction(text):
+    """Simple rule-based fallback extraction"""
     print("⚠️ Using fallback extraction...")
     entities = []
+    relationships = []
+    seen_entities = set()
+    # Split into sentences
+    sentences = [s.strip() for s in text.split('.') if s.strip()]
+    for sentence in sentences:
+        words = sentence.split()
+        # Extract capitalized words/phrases as entities
+        current_entity = []
+        for word in words:
+            clean = re.sub(r'[^\w\s]', '', word)
+            if clean and clean[0].isupper() and len(clean) > 2:
+                current_entity.append(clean)
+            elif current_entity:
+                entity_name = ' '.join(current_entity)
+                if entity_name not in seen_entities:
+                    entities.append({
+                        "name": entity_name,
+                        "type": "Concept",
+                        "description": sentence[:100]
+                    })
+                    seen_entities.add(entity_name)
+                current_entity = []
+        # Check for common relationship patterns
+        if ' created ' in sentence.lower() or ' developed ' in sentence.lower():
+            # Try to extract creator and creation
+            parts = re.split(r' created | developed ', sentence, flags=re.IGNORECASE)
+            if len(parts) == 2:
+                creator = parts[0].strip().split()[-1]
+                creation = parts[1].strip().split()[0]
+                relationships.append({
+                    "source": creator,
+                    "target": creation,
+                    "type": "CREATED",
+                    "context": sentence[:100]
                 })
+    print(f"Fallback extracted: {len(entities)} entities, {len(relationships)} relationships")
+    return {"entities": entities[:15], "relationships": relationships[:10]}
 def add_to_graph(entities, relationships, doc_name):
     """Add entities and relationships to Neo4j with proper sanitization"""