yuvrajsingh6 commited on
Commit
9c4c212
·
0 Parent(s):

deploy: v2 production baked index (zero latency)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +56 -0
  2. .env.example +6 -0
  3. .github/workflows/deploy_to_hf.yml +20 -0
  4. .github/workflows/docker-build.yml +34 -0
  5. .gitignore +23 -0
  6. AWS_APP_RUNNER_SETUP.md +48 -0
  7. AWS_DEPLOYMENT.md +152 -0
  8. DEPLOYMENT.md +178 -0
  9. Dockerfile +27 -0
  10. Makefile +40 -0
  11. Procfile +1 -0
  12. README.md +170 -0
  13. app.py +173 -0
  14. configs/default.yaml +22 -0
  15. data/raw/finphrase_000.txt +3 -0
  16. data/raw/finphrase_001.txt +3 -0
  17. data/raw/finphrase_002.txt +3 -0
  18. data/raw/finphrase_003.txt +3 -0
  19. data/raw/finphrase_004.txt +3 -0
  20. data/raw/finphrase_005.txt +3 -0
  21. data/raw/finphrase_006.txt +3 -0
  22. data/raw/finphrase_007.txt +3 -0
  23. data/raw/finphrase_008.txt +3 -0
  24. data/raw/finphrase_009.txt +3 -0
  25. data/raw/finphrase_010.txt +3 -0
  26. data/raw/finphrase_011.txt +3 -0
  27. data/raw/finphrase_012.txt +3 -0
  28. data/raw/finphrase_013.txt +3 -0
  29. data/raw/finphrase_014.txt +3 -0
  30. data/raw/finphrase_015.txt +3 -0
  31. data/raw/finphrase_016.txt +3 -0
  32. data/raw/finphrase_017.txt +3 -0
  33. data/raw/finphrase_018.txt +3 -0
  34. data/raw/finphrase_019.txt +3 -0
  35. data/raw/finphrase_020.txt +3 -0
  36. data/raw/finphrase_021.txt +3 -0
  37. data/raw/finphrase_022.txt +3 -0
  38. data/raw/finphrase_023.txt +3 -0
  39. data/raw/finphrase_024.txt +3 -0
  40. data/raw/finphrase_025.txt +3 -0
  41. data/raw/finphrase_026.txt +3 -0
  42. data/raw/finphrase_027.txt +3 -0
  43. data/raw/finphrase_028.txt +3 -0
  44. data/raw/finphrase_029.txt +3 -0
  45. data/raw/finphrase_030.txt +3 -0
  46. data/raw/finphrase_031.txt +3 -0
  47. data/raw/finphrase_032.txt +3 -0
  48. data/raw/finphrase_033.txt +3 -0
  49. data/raw/finphrase_034.txt +3 -0
  50. data/raw/finphrase_035.txt +3 -0
.dockerignore ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ .Python
7
+
8
+ # Virtual Environment
9
+ venv/
10
+ env/
11
+ ENV/
12
+ .venv
13
+
14
+ # Environment Variables
15
+ .env
16
+ .env.local
17
+
18
+ # Data directories
19
+ data/raw/
20
+ # data/index/ <-- We want to include this for the showcase
21
+ data/db/
22
+ data/processed/
23
+
24
+ # IDE
25
+ .vscode/
26
+ .idea/
27
+ *.swp
28
+ *.swo
29
+ *~
30
+
31
+ # OS
32
+ .DS_Store
33
+ Thumbs.db
34
+
35
+ # Git
36
+ .git/
37
+ .gitignore
38
+
39
+ # Testing
40
+ .pytest_cache/
41
+ .coverage
42
+ htmlcov/
43
+
44
+ # Documentation (not needed in container)
45
+ docs/
46
+ *.md
47
+ !README.md
48
+
49
+ # Notebooks
50
+ notebooks/
51
+ *.ipynb
52
+
53
+ # Build artifacts
54
+ dist/
55
+ build/
56
+ *.egg-info/
.env.example ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ OPENAI_API_KEY=sk-xxx
2
+ VLLM_API_URL=http://localhost:8000/v1
3
+ # VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2
4
+ LOG_LEVEL=INFO
5
+ DATA_DIR=data
6
+ INDEX_DIR=data/index
.github/workflows/deploy_to_hf.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face Hub
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+
7
+ jobs:
8
+ sync-to-hub:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v3
12
+ with:
13
+ fetch-depth: 0
14
+ lfs: true
15
+ - name: Push to hub
16
+ env:
17
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
18
+ run: |
19
+ git remote add hf https://yuvis:$HF_TOKEN@huggingface.co/spaces/yuvis/Enterprise-RAG-System
20
+ git push -f hf main
.github/workflows/docker-build.yml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Build and Push Docker Image
2
+
3
+ on:
4
+ push:
5
+ branches: [ "main", "master" ]
6
+ workflow_dispatch:
7
+
8
+ jobs:
9
+ build-and-push:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - name: Checkout repository
13
+ uses: actions/checkout@v4
14
+
15
+ - name: Set up QEMU
16
+ uses: docker/setup-qemu-action@v3
17
+
18
+ - name: Set up Docker Buildx
19
+ uses: docker/setup-buildx-action@v3
20
+
21
+ - name: Login to Docker Hub
22
+ uses: docker/login-action@v3
23
+ with:
24
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
25
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
26
+
27
+ - name: Build and push
28
+ uses: docker/build-push-action@v5
29
+ with:
30
+ context: .
31
+ push: true
32
+ tags: ${{ secrets.DOCKERHUB_USERNAME }}/enterprise-rag:latest
33
+ cache-from: type=gha
34
+ cache-to: type=gha,mode=max
.gitignore ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+
5
+ # Virtual Env
6
+ venv/
7
+ env/
8
+
9
+ # Environment Variables
10
+ .env
11
+
12
+ # System
13
+ .DS_Store
14
+
15
+ # Data (Generated/Downloaded)
16
+ data/index/
17
+ data/raw/
18
+ # data/db/
19
+ # data/processed/
20
+
21
+ # IDE
22
+ .vscode/
23
+ .idea/
AWS_APP_RUNNER_SETUP.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AWS App Runner Deployment Guide
2
+
3
+ Follow these steps to deploy your Enterprise RAG System to AWS App Runner for a recruiter-ready showcase.
4
+
5
+ ## 1. Local Verification
6
+ First, build and run your image locally to ensure the index is properly packaged:
7
+ ```bash
8
+ docker build -t enterprise-rag .
9
+ docker run -p 8501:8501 -e GROQ_API_KEY=your_key_here enterprise-rag
10
+ ```
11
+ Visit `http://localhost:8501` to verify.
12
+
13
+ ## 2. Push to AWS ECR
14
+ You need to push your image to the Amazon Elastic Container Registry.
15
+
16
+ 1. **Create Repository**:
17
+ ```bash
18
+ aws ecr create-repository --repository-name enterprise-rag --region your-region
19
+ ```
20
+ 2. **Login to ECR**:
21
+ ```bash
22
+ aws ecr get-login-password --region your-region | docker login --username AWS --password-stdin <your-account-id>.dkr.ecr.<your-region>.amazonaws.com
23
+ ```
24
+ 3. **Tag & Push**:
25
+ ```bash
26
+ docker tag enterprise-rag:latest <your-account-id>.dkr.ecr.<your-region>.amazonaws.com/enterprise-rag:latest
27
+ docker push <your-account-id>.dkr.ecr.<your-region>.amazonaws.com/enterprise-rag:latest
28
+ ```
29
+
30
+ ## 3. Create App Runner Service
31
+ 1. Go to **AWS Console** → **App Runner**.
32
+ 2. Click **Create service**.
33
+ 3. **Source**:
34
+ - Repository type: **Container registry**.
35
+ - Provider: **Amazon ECR**.
36
+ - Container image: Select your `enterprise-rag` image.
37
+ - Deployment settings: **Manual** (or Automatic if you want CI/CD).
38
+ 4. **Configuration**:
39
+ - Service name: `enterprise-rag-showcase`.
40
+ - Virtual CPU & Memory: **1 vCPU & 2 GB** (Minimum recommended).
41
+ - **Environment variables**:
42
+ - `GROQ_API_KEY`: Paste your key here.
43
+ 5. **Connectivity**:
44
+ - Port: **8501**.
45
+ 6. **Review & Create**.
46
+
47
+ ## 4. Final Result
48
+ Once deployed, AWS will provide a public URL like `https://xxxxxx.us-east-1.awsapprunner.com`. This is the URL you can share with recruiters!
AWS_DEPLOYMENT.md ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AWS Deployment Guide - Enterprise RAG System
2
+
3
+ ## Prerequisites
4
+ - AWS Account
5
+ - AWS CLI installed and configured
6
+ - Docker installed locally
7
+
8
+ ## Deployment Options
9
+
10
+ ### Option 1: AWS EC2 (Recommended for Full Control)
11
+
12
+ #### Step 1: Launch EC2 Instance
13
+ 1. Go to AWS Console → EC2
14
+ 2. Click **"Launch Instance"**
15
+ 3. Choose:
16
+ - **AMI**: Ubuntu 22.04 LTS
17
+ - **Instance Type**: t3.medium (4GB RAM minimum)
18
+ - **Storage**: 20GB
19
+ 4. Configure Security Group:
20
+ - Allow SSH (port 22) from your IP
21
+ - Allow HTTP (port 8501) from anywhere
22
+ - Allow HTTP (port 8000) from anywhere
23
+
24
+ #### Step 2: Connect to Instance
25
+ ```bash
26
+ ssh -i your-key.pem ubuntu@your-ec2-public-ip
27
+ ```
28
+
29
+ #### Step 3: Install Docker
30
+ ```bash
31
+ # Update system
32
+ sudo apt update && sudo apt upgrade -y
33
+
34
+ # Install Docker
35
+ curl -fsSL https://get.docker.com -o get-docker.sh
36
+ sudo sh get-docker.sh
37
+ sudo usermod -aG docker ubuntu
38
+
39
+ # Install Docker Compose
40
+ sudo apt install docker-compose-plugin
41
+ ```
42
+
43
+ #### Step 4: Clone Repository
44
+ ```bash
45
+ git clone https://github.com/YuvrajSinghBhadoria2/Enterprise-RAG-System.git
46
+ cd Enterprise-RAG-System
47
+ ```
48
+
49
+ #### Step 5: Configure Environment
50
+ ```bash
51
+ # Create .env file
52
+ cat > .env << EOF
53
+ GROQ_API_KEY=your_groq_api_key_here
54
+ EOF
55
+ ```
56
+
57
+ #### Step 6: Build and Run
58
+ ```bash
59
+ # Using Docker Compose
60
+ docker compose -f docker/docker-compose.yml up -d --build
61
+
62
+ # Generate data (one-time)
63
+ docker compose -f docker/docker-compose.yml exec api python3 tools/generate-dataset.py
64
+ docker compose -f docker/docker-compose.yml exec api python3 src/ingestion/ingest.py
65
+ ```
66
+
67
+ #### Step 7: Access Application
68
+ - **UI**: `http://your-ec2-public-ip:8501`
69
+ - **API**: `http://your-ec2-public-ip:8000/docs`
70
+
71
+ ---
72
+
73
+ ### Option 2: AWS ECS (Fargate) - Serverless
74
+
75
+ #### Step 1: Push Docker Image to ECR
76
+ ```bash
77
+ # Create ECR repository
78
+ aws ecr create-repository --repository-name enterprise-rag
79
+
80
+ # Login to ECR
81
+ aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin YOUR_ACCOUNT_ID.dkr.ecr.us-east-1.amazonaws.com
82
+
83
+ # Build and push
84
+ docker build -f docker/Dockerfile.api -t enterprise-rag .
85
+ docker tag enterprise-rag:latest YOUR_ACCOUNT_ID.dkr.ecr.us-east-1.amazonaws.com/enterprise-rag:latest
86
+ docker push YOUR_ACCOUNT_ID.dkr.ecr.us-east-1.amazonaws.com/enterprise-rag:latest
87
+ ```
88
+
89
+ #### Step 2: Create ECS Task Definition
90
+ 1. Go to ECS Console
91
+ 2. Create new Task Definition (Fargate)
92
+ 3. Add container:
93
+ - Image: Your ECR image URI
94
+ - Memory: 4GB
95
+ - Port: 8501
96
+ 4. Add environment variable: `GROQ_API_KEY`
97
+
98
+ #### Step 3: Create ECS Service
99
+ 1. Create ECS Cluster
100
+ 2. Create Service from Task Definition
101
+ 3. Configure Load Balancer (optional)
102
+
103
+ ---
104
+
105
+ ### Option 3: AWS Lightsail (Simplest)
106
+
107
+ #### Step 1: Create Lightsail Instance
108
+ 1. Go to Lightsail Console
109
+ 2. Create Instance:
110
+ - Platform: Linux/Unix
111
+ - Blueprint: Ubuntu 22.04
112
+ - Plan: $10/month (2GB RAM)
113
+
114
+ #### Step 2: Deploy
115
+ Same as EC2 steps 2-7 above
116
+
117
+ ---
118
+
119
+ ## Cost Estimates
120
+
121
+ | Service | Cost/Month | Best For |
122
+ |---------|-----------|----------|
123
+ | EC2 t3.medium | ~$30 | Full control, testing |
124
+ | ECS Fargate | ~$40 | Production, auto-scaling |
125
+ | Lightsail | $10-20 | Simple deployment |
126
+
127
+ ## Recommended: EC2 t3.medium
128
+
129
 + For this use case, **EC2 t3.medium** is recommended because:
130
+ - ✅ Full control
131
+ - ✅ Easy to manage
132
+ - ✅ Cost-effective
133
+ - ✅ Can run Docker Compose easily
134
+
135
+ ## Maintenance
136
+
137
+ **Update code:**
138
+ ```bash
139
+ cd Enterprise-RAG-System
140
+ git pull
141
+ docker compose -f docker/docker-compose.yml up -d --build
142
+ ```
143
+
144
+ **View logs:**
145
+ ```bash
146
+ docker compose -f docker/docker-compose.yml logs -f
147
+ ```
148
+
149
+ **Restart services:**
150
+ ```bash
151
+ docker compose -f docker/docker-compose.yml restart
152
+ ```
DEPLOYMENT.md ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deployment Guide
2
+
3
+ This guide ensures a smooth deployment of the Enterprise RAG system to any cloud VPS (Virtual Private Server) such as AWS EC2, DigitalOcean Droplet, Google Compute Engine, or Azure VM.
4
+
5
+ ## 🚀 Prerequisites
6
+
7
+ * **Server**: A Linux server (Ubuntu 22.04 LTS recommended).
8
+ * **Specs**: Minimum 4GB RAM (8GB recommended for embeddings/FAISS), 2 vCPUs.
9
+ * **Software**: Docker and Docker Compose installed.
10
+
11
 + ## ☁️ Recommended Providers
12
+
13
+ **Railway.app** (Easiest PaaS):
14
+ * Perfect for quick demos.
15
+ * Supports our `Dockerfile` setup out of the box.
16
+
17
+ ## 🚂 Railway.app Deployment (Complete Setup)
18
+
19
+ Railway requires deploying **two separate services** from the same repository.
20
+
21
+ ### Prerequisites
22
+ * GitHub repository: `https://github.com/YuvrajSinghBhadoria2/Enterprise-RAG-System.git`
23
+ * Railway account: [railway.app](https://railway.app)
24
+ * Groq API Key
25
+
26
+ ### Step 1: Deploy API Service
27
+
28
+ 1. **Create New Project** in Railway
29
+ 2. **Deploy from GitHub** → Select your repository
30
+ 3. Railway will auto-detect the `railway.toml` and use the Dockerfile
31
+ 4. **Add Environment Variables**:
32
+ ```
33
+ GROQ_API_KEY=gsk_your_key_here
34
+ PORT=8000
35
+ ```
36
+ 5. **Deploy** - Railway will build using `docker/Dockerfile.api`
37
+ 6. **Get API URL** - Copy the public URL (e.g., `https://enterprise-rag-production.up.railway.app`)
38
+
39
+ ### Step 2: Deploy UI Service
40
+
41
+ 1. In the **same Railway project**, click **+ New Service**
42
+ 2. **Deploy from GitHub** → Select the **same repository**
43
+ 3. **Configure Build**:
44
+ - Go to **Settings** → **Build**
45
+ - Set **Dockerfile Path**: `docker/Dockerfile.streamlit`
46
+ 4. **Add Environment Variable**:
47
+ ```
48
+ API_URL=https://your-api-url-from-step1.up.railway.app/api/v1/chat
49
+ ```
50
+ *(Replace with your actual API URL from Step 1)*
51
+ 5. **Deploy**
52
+
53
+ ### Step 3: Generate Data (Critical!)
54
+
55
+ Railway containers are ephemeral, so you need to generate data on startup:
56
+
57
+ **Option A: One-time manual generation** (for testing):
58
+ ```bash
59
+ # In Railway API service shell
60
+ python3 tools/generate-dataset.py
61
+ python3 src/ingestion/ingest.py
62
+ ```
63
+
64
+ **Option B: Auto-generate on startup** (recommended):
65
+ Update the `railway.toml` start command to include data generation.
66
+
67
+ ### Step 4: Access Your Application
68
+
69
+ - **UI**: `https://your-ui-service.up.railway.app`
70
+ - **API Docs**: `https://your-api-service.up.railway.app/docs`
71
+
72
+ ### Troubleshooting
73
+
74
+ **Build Timeout?**
75
+ - Ensure `.dockerignore` excludes `venv/` and `data/`
76
+ - Check that `railway.toml` points to the correct Dockerfile
77
+
78
+ **UI Can't Connect to API?**
79
+ - Verify `API_URL` environment variable in UI service
80
+ - Ensure API service is deployed and running
81
+ - Check API URL includes `/api/v1/chat` endpoint
82
+
83
+ ---
84
+
85
+ ## 📦 Step-by-Step Deployment (VPS)
86
+
87
+ ### 1. Provision & Access Server
88
+ SSH into your server:
89
+ ```bash
90
+ ssh user@your-server-ip
91
+ ```
92
+
93
+ ### 2. Install Docker (If not installed)
94
+ ```bash
95
+ # Update packages
96
+ sudo apt update && sudo apt upgrade -y
97
+
98
+ # Install Docker
99
+ curl -fsSL https://get.docker.com -o get-docker.sh
100
+ sudo sh get-docker.sh
101
+
102
+ # Install Docker Compose Plugin
103
+ sudo apt install docker-compose-plugin
104
+ ```
105
+
106
+ ### 3. Clone the Repository
107
+ ```bash
108
+ git clone https://github.com/your-repo/enterprise-rag.git
109
+ cd enterprise-rag
110
+ ```
111
+
112
+ ### 4. Configure Environment
113
+ Create the production `.env` file:
114
+ ```bash
115
+ cp .env.example .env
116
+ nano .env
117
+ ```
118
+ *Paste your `GROQ_API_KEY` or `OPENAI_API_KEY` into the file.*
119
+
120
+ ### 5. Build and Start Services
121
+ This command will build the images and start the API and UI in the background with auto-restart enabled.
122
+ ```bash
123
+ # Using the Makefile shortcut
124
+ make up
125
+
126
+ # OR manually using docker compose
127
+ docker compose -f docker/docker-compose.yml up -d --build
128
+ ```
129
+
130
+ ### 6. Generate Data (Critical Step)
131
+ Fresh deployments start empty. You must ingest the datasets to make the search work.
132
+ ```bash
133
+ # Run ingestion inside the running API container
134
+ docker compose -f docker/docker-compose.yml exec api python3 tools/generate-dataset.py
135
+ docker compose -f docker/docker-compose.yml exec api python3 src/ingestion/ingest.py
136
+ ```
137
+
138
+ ### 7. Access the Application
139
+ * **UI**: `http://your-server-ip:8501`
140
+ * **API**: `http://your-server-ip:8000/docs`
141
+
142
+ ---
143
+
144
+ ## 🔒 Production Hardening
145
+
146
+ ### 1. Firewall (UFW)
147
+ Only open necessary ports.
148
+ ```bash
149
+ sudo ufw allow 22/tcp
150
+ sudo ufw allow 8501/tcp # Streamlit
151
+ sudo ufw allow 8000/tcp # API
152
+ sudo ufw enable
153
+ ```
154
+
155
+ ### 2. Reverse Proxy (Nginx + SSL)
156
+ For HTTPS, use Nginx as a reverse proxy.
157
+ ```nginx
158
+ server {
159
+ listen 80;
160
+ server_name rag.yourdomain.com;
161
+
162
+ location / {
163
+ proxy_pass http://localhost:8501;
164
+ proxy_set_header Host $host;
165
+ proxy_set_header X-Real-IP $remote_addr;
166
+ }
167
+ }
168
+ ```
169
+
170
+ ## 🛠️ Maintenance
171
+
172
+ * **View Logs**: `make logs`
173
+ * **Restart Services**: `make down && make up`
174
+ * **Update Code**:
175
+ ```bash
176
+ git pull
177
+ make up
178
+ ```
Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install system dependencies
6
+ RUN apt-get update && apt-get install -y \
7
+ build-essential \
8
+ && rm -rf /var/lib/apt/lists/*
9
+
10
+ # Copy requirements and install Python dependencies
11
+ COPY requirements.txt .
12
+ RUN pip install --no-cache-dir -r requirements.txt
13
+
14
+ # Copy application code
15
+ COPY . .
16
+
17
+ # Download index during build (Bake into image)
18
+ RUN python tools/download_index.py
19
+
20
+ # Set Python path
21
+ ENV PYTHONPATH=/app
22
+
23
+ # Expose Streamlit port
24
+ EXPOSE 8501
25
+
26
+ # Run Streamlit
27
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
Makefile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: build up down logs ingest eval run-local
2
+
3
+ # Docker commands
4
+ build:
5
+ docker-compose -f docker/docker-compose.yml --env-file .env build
6
+
7
+ up:
8
+ docker-compose -f docker/docker-compose.yml --env-file .env up -d --build
9
+
10
+ down:
11
+ docker-compose -f docker/docker-compose.yml --env-file .env down
12
+
13
+ logs:
14
+ docker-compose -f docker/docker-compose.yml --env-file .env logs -f
15
+
16
+ api-shell:
17
+ docker-compose -f docker/docker-compose.yml --env-file .env exec api /bin/bash
18
+
19
+ # Run evaluation inside Docker
20
+ eval:
21
+ docker-compose -f docker/docker-compose.yml --env-file .env exec api python3 tools/run_eval.py
22
+
23
+ # Run evaluation locally (Mac fallback)
24
+ eval-local:
25
+ export DISABLE_FAISS=1 && export KMP_DUPLICATE_LIB_OK=TRUE && export GROQ_API_KEY=${GROQ_API_KEY} && python3 tools/run_eval.py
26
+
27
+ # Ingestion (runs locally if venv active, or use via docker exec)
28
+ ingest:
29
+ export PYTHONPATH=$$PYTHONPATH:. && python3 src/ingestion/ingest.py
30
+
31
+ # Data generation
32
+ generate-data:
33
+ python3 tools/generate-dataset.py
34
+
35
+ # Run API and UI locally (Mac fallback)
36
+ run-local:
37
+ @echo "Starting Enterprise RAG Locally (Safe Mode)..."
38
+ @export DISABLE_FAISS=1 && export KMP_DUPLICATE_LIB_OK=TRUE && export GROQ_API_KEY=${GROQ_API_KEY} && \
39
+ (uvicorn src.app.main:app --host 0.0.0.0 --port 8000 &) && \
40
+ (sleep 5 && streamlit run src/ui/app.py --server.port 8501)
Procfile ADDED
@@ -0,0 +1 @@
 
 
1
+ web: uvicorn src.app.main:app --host 0.0.0.0 --port $PORT
README.md ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Enterprise RAG System
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
7
+ app_port: 8501
8
+ pinned: false
9
+ ---
10
+
11
+ # Enterprise RAG System
12
+
13
+ An enterprise-grade Retrieval-Augmented Generation (RAG) system designed for high accuracy, safety, and scalability. This project demonstrates a production-ready pipeline with hybrid search, reranking, and strict guardrails against hallucinations.
14
+
15
+ ## 🚀 Key Features
16
+
17
+ * **Hybrid Retrieval**: Combines **BM25** (Keyword) and **FAISS** (Dense Vector) search for optimal recall.
18
+ * **Context Reranking**: Utilizes `cross-encoder/ms-marco-MiniLM-L-6-v2` to precision-rank documents before generation.
19
+ * **Enterprise Guardrails**:
20
+ * **Refusal Logic**: Strictly refuses to answer if context is insufficient.
21
 + * **Hallucination Detection**: Automated grading of Answer Relevancy and Groundedness.
22
+ * **Confidence Gating**: Blocks generation if retrieval scores are below a safety threshold.
23
+ * **Multi-Provider LLM**: Supports **Groq** (Llama-3), **vLLM**, and **OpenAI**.
24
+ * **Modern Stack**: Built with **FastAPI**, **Streamlit**, and **Docker**.
25
+
26
+ ---
27
+
28
+ ## 🛠️ Quick Start
29
+
30
+ ### Prerequisites
31
+ * Docker Desktop (Recommended)
32
+ * Python 3.10+ (For local run)
33
+ * Groq API Key (or OpenAI/vLLM)
34
+
35
+ ### 1. Configuration
36
+ Create a `.env` file in the root directory:
37
+ ```bash
38
+ cp .env.example .env
39
+ ```
40
+ Edit `.env` and add your API key:
41
+ ```ini
42
+ GROQ_API_KEY=gsk_...
43
+ # Cloud Vector DB (Optional - Recommended for Deployment)
44
+ VECTOR_DB_TYPE=pinecone
45
+ PINECONE_API_KEY=pcsk_...
46
+
47
+ # Optional:
48
+ OPENAI_API_KEY=sk-...
49
+ ```
50
+
51
+ ### 2. Generate Data
52
+ The system needs data to function. Run the ingestion script to download real datasets (WikiQA, Multi-News, GovReport) and build the index:
53
+ ```bash
54
+ make generate-data
55
+ make ingest
56
+ ```
57
+
58
+ ### 3. Run the Application
59
+
60
+ #### Option A: Docker (Recommended for Linux/Windows)
61
+ The most stable environment.
62
+ ```bash
63
+ make up
64
+ ```
65
+ * **UI**: [http://localhost:8501](http://localhost:8501)
66
+ * **API**: [http://localhost:8000/docs](http://localhost:8000/docs)
67
+
68
+ #### Option B: Local Safe Mode (Apple Silicon / Mac)
69
+ Use this if you encounter Docker connection issues or `Segmentation Fault` (FAISS/OpenMP conflicts). This mode disables FAISS and runs on **BM25 (Keyword Search) only**, ensuring stability.
70
+ ```bash
71
+ make run-local
72
+ ```
73
+ *(This starts both the FastAPI backend and Streamlit UI)*
74
+
75
+ ---
76
+
77
+ ## 📊 Evaluation
78
+
79
+ Verify the accuracy and safety of the system using the built-in evaluation suite.
80
+
81
+ ### Run Evaluation (Local Safe Mode)
82
+ ```bash
83
+ make eval-local
84
+ ```
85
+ This script will:
86
+ 1. Load the **WikiQA** test set.
87
+ 2. Run the full RAG pipeline for each question.
88
+ 3. Report:
89
+ * **Recall@10**: Retrieval effectiveness.
90
+ * **MRR**: Mean Reciprocal Rank.
91
+ * **Groundedness**: Frequency of hallucination checks passing.
92
+ * **Refusal Rate**: How often the system correctly refuses unknown questions.
93
+
94
+ ## 🏆 Performance & Results
95
+
96
+ Tested on **WikiQA**, **Multi-News**, and **GovReport** datasets.
97
+
98
+ | Metric | Score | Description |
99
+ | :--- | :--- | :--- |
100
+ | **Recall@10** | **1.0000** | Perfect retrieval of relevant documents. |
101
+ | **MRR** | **1.0000** | Relevant document consistently ranked #1. |
102
+ | **Factuality** | **1.0000** | 100% of answers grounded in context. |
103
+ | **Safety** | **100%** | Successfully refuses to answer out-of-context queries. |
104
+
105
+ ### 🛡️ Guardrails in Action
106
+
107
+ **1. Hallucination Prevention (Safety Layer)**
108
+ ![Hallucination Blocked](docs/assets/refusal_screenshot.png)
109
+ * **Scenario Refusal**: The system correctly identified that the retrieved context (about Howard Stern) was irrelevant to the "Airline Strike" query.
110
+ * **Low Confidence**: The retrieval score of `-9.73` triggered the safety gate (Threshold: `-4.0`), automatically blocking the generation.
111
+ * **Result**: Zero hallucination. The user receives a safe, honest refusal instead of a made-up answer.
112
+
113
+ **2. Enterprise Accuracy**
114
+ ![Detailed Answer](docs/assets/success_screenshot.png)
115
+ * **High Precision**: The query for "Emerging Contaminants" retrieved exact matches from the *GovReport* dataset.
116
+ * **Grounded Generation**: The answer is derived *strictly* from the text, listing specific chemicals (Perchlorate, TCE, etc.) mentioned in the document.
117
+ * **Verified**: Retrieval score of `4.82` shows high confidence, allowing the answer to pass.
118
+
119
+ ---
120
+
121
+ ## 🏗️ Architecture
122
+
123
+ ```mermaid
124
+ flowchart TD
125
+ User([User Query]) --> Hybrid{Hybrid Retrieval}
126
+
127
+ subgraph Retrieval Layer
128
+ Hybrid -->|Lexical| BM25[BM25 Index]
129
+ Hybrid -->|Semantic| FAISS[FAISS Vector DB]
130
+ BM25 --> Candidates[Candidate Pool]
131
+ FAISS --> Candidates
132
+ end
133
+
134
+ Candidates --> Rerank[Cross-Encoder Reranker]
135
+
136
+ subgraph Safety Layer
137
+ Rerank --> Gate{Confidence Gate}
138
+ Gate -- Score < -4.0 --> Refusal([Refusal Response])
139
+ Gate -- Score >= -4.0 --> Context[Context Optimization]
140
+ end
141
+
142
+ subgraph Generation Layer
143
+ Context --> Prompt[Strict System Prompt]
144
+ Prompt --> LLM[LLM Inference]
145
+ LLM --> Guard[Hallucination Check]
146
+ Guard --> Final([Final Answer])
147
+ end
148
+
149
+ style Refusal fill:#ffcccc,stroke:#333
150
+ style Final fill:#ccffcc,stroke:#333
151
+ style Gate fill:#fff4e6,stroke:#ff9900
152
+ ```
153
+
154
+ 1. **Ingestion**: Documents are cleaned, chunked (Sliding Window), and indexed into **Faiss** (Vectors) and **BM25** (Keywords).
155
+ 2. **Retrieval**: Queries retrieve candidates from both indices (`HybridRetriever`).
156
+ 3. **Reranking**: A Cross-Encoder scores the relevance of each candidate pair (`Query, Doc`).
157
+ 4. **Guardrails**:
158
+ * If `Max(Rerank Score) < -4.0`: **Refuse** immediately.
159
+ 5. **Generation**: Top documents are passed to the LLM with a strict "Context-Only" system prompt.
160
+ 6. **Validation**: Output is graded for Groundedness (Token Overlap) before being returned (in Evaluation mode).
161
+
162
+ ## 📁 Project Structure
163
+
164
+ * `src/app`: FastAPI Backend
165
+ * `src/ui`: Streamlit Frontend
166
+ * `src/pipeline`: Core RAG Logic (`QueryPipeline.py`)
167
+ * `src/retriever`: Search Algorithms (`HybridRetriever.py`)
168
+ * `src/eval`: Scoring Metrics (`Hallucination`, `Relevancy`)
169
+ * `tools`: Scripts for dataset generation and evaluation
170
+ * `data`: Raw and Indexed data storage
app.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face Spaces - Enterprise RAG System
3
+ Standalone Streamlit application
4
+ """
5
+
6
+ import streamlit as st
7
+ import os
8
+ import sys
9
+
10
+
11
+
12
+ # Add src to path
13
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
14
+
15
+ from dotenv import load_dotenv
16
+ load_dotenv()
17
+
18
+ from src.pipeline.query_pipeline import QueryPipeline
19
+ from src.ingestion.ingest import IngestionPipeline
20
+ import subprocess
21
+
22
+ def prepare_data():
23
+ """Ensure data is generated and indexed if missing"""
24
+ INDEX_DIR = "data/index"
25
+ RAW_DIR = "data/raw"
26
+
27
+ # 1. Create directories
28
+ os.makedirs(INDEX_DIR, exist_ok=True)
29
+ os.makedirs(RAW_DIR, exist_ok=True)
30
+
31
+ # 2. Check if raw data exists (Only needed if NOT using Pinecone)
32
+ if os.getenv("VECTOR_DB_TYPE", "").lower() != "pinecone":
33
+ if not os.listdir(RAW_DIR):
34
+ st.error("❌ Data folder empty! Please commit your 'data/raw' folder to Git and redeploy.")
35
+ st.stop()
36
+
37
+ # 3. Check if indices exist, if not run ingestion (Skip for Pinecone)
38
+ # 3. Check if indices exist (Files should be baked in)
39
+ bm25_path = os.path.join(INDEX_DIR, "bm25.pkl")
40
+
41
+ # Only download if absolutely missing (Fallback for dev env)
42
+ if not os.path.exists(bm25_path):
43
+ with st.spinner("Downloading Knowledge Base (Dev Mode)..."):
44
+ try:
45
+ from huggingface_hub import hf_hub_download
46
+ os.makedirs(INDEX_DIR, exist_ok=True)
47
+ hf_hub_download(repo_id="yuvis/enterprise-rag-index", filename="index/bm25.pkl", repo_type="dataset", local_dir="data")
48
+ hf_hub_download(repo_id="yuvis/enterprise-rag-index", filename="index/doc_map.pkl", repo_type="dataset", local_dir="data")
49
+ except Exception:
50
+ pass
51
+
52
+ st.set_page_config(
53
+ page_title="Enterprise RAG Search",
54
+ page_icon="🔍",
55
+ layout="wide"
56
+ )
57
+
58
+ # Initialize pipeline
59
+ @st.cache_resource
60
+ def load_pipeline():
61
+ """Load the RAG pipeline (cached for performance)"""
62
+ try:
63
+ # Ensure data is ready before initializing pipeline
64
+ prepare_data()
65
+ return QueryPipeline()
66
+ except Exception as e:
67
+ st.error(f"Error loading pipeline: {e}")
68
+ st.exception(e)
69
+ return None
70
+
71
+ # Main UI
72
+ st.title("🔍 Enterprise RAG Search")
73
+ st.markdown("*Production-grade Retrieval-Augmented Generation with Hallucination Prevention*")
74
+
75
+ # Sidebar configuration
76
+ with st.sidebar:
77
+ st.header("⚙️ Configuration")
78
+ st.caption("🚀 Version: Pinecone V2")
79
+
80
+ # Check for API key
81
+ groq_key = os.getenv("GROQ_API_KEY")
82
+ if not groq_key:
83
+ st.warning("⚠️ GROQ_API_KEY not set. Please configure in Space settings.")
84
+ else:
85
+ st.success("✅ API Key configured")
86
+
87
+ st.divider()
88
+
89
+ top_k_retrieval = st.slider("Retrieval Top-K", 5, 50, 20)
90
+ top_k_rerank = st.slider("Rerank Top-K", 1, 10, 5)
91
+
92
+ st.divider()
93
+ st.markdown("### 📊 System Info")
94
+ st.info("""
95
+ - **Hybrid Search**: BM25 + FAISS
96
+ - **Reranking**: Cross-Encoder
97
+ - **Safety**: Confidence Gating
98
+ """)
99
+
100
+ # Initialize session state
101
+ if "messages" not in st.session_state:
102
+ st.session_state.messages = []
103
+
104
+ # Display chat history
105
+ for message in st.session_state.messages:
106
+ with st.chat_message(message["role"]):
107
+ st.markdown(message["content"])
108
+
109
+ # Chat input
110
+ if prompt := st.chat_input("Ask a question about your documents..."):
111
+ # Add user message
112
+ st.session_state.messages.append({"role": "user", "content": prompt})
113
+ with st.chat_message("user"):
114
+ st.markdown(prompt)
115
+
116
+ # Generate response
117
+ with st.chat_message("assistant"):
118
+ with st.spinner("Searching and generating answer..."):
119
+ pipeline = load_pipeline()
120
+
121
+ if pipeline is None:
122
+ st.error("Pipeline not loaded. Please check configuration.")
123
+ else:
124
+ try:
125
+ result = pipeline.run(
126
+ query=prompt,
127
+ top_k_retrieval=top_k_retrieval,
128
+ top_k_rerank=top_k_rerank
129
+ )
130
+
131
+ # Display answer
132
+ st.markdown(result["answer"])
133
+
134
+ # Display metadata in expander
135
+ with st.expander("📋 View Details"):
136
+ col1, col2, col3 = st.columns(3)
137
+
138
+ with col1:
139
+ st.metric("Retrieval Score", f"{result.get('retrieval_score', 'N/A'):.2f}")
140
+
141
+ with col2:
142
+ hallucination = result.get('hallucination_score', 'N/A')
143
+ if hallucination != 'N/A':
144
+ st.metric("Hallucination Score", f"{hallucination:.2f}")
145
+
146
+ with col3:
147
+ groundedness = result.get('groundedness', 'N/A')
148
+ if groundedness != 'N/A':
149
+ st.metric("Groundedness", f"{groundedness:.2f}")
150
+
151
+ # Show retrieved context
152
+ if result.get("context"):
153
+ st.markdown("**Retrieved Context:**")
154
+ for i, (doc, score) in enumerate(result["context"][:3], 1):
155
+ st.markdown(f"{i}. [Score: {score:.2f}] {doc[:200]}...")
156
+
157
+ # Add to chat history
158
+ st.session_state.messages.append({
159
+ "role": "assistant",
160
+ "content": result["answer"]
161
+ })
162
+
163
+ except Exception as e:
164
+ st.error(f"Error generating response: {e}")
165
+ st.exception(e)
166
+
167
+ # Footer
168
+ st.divider()
169
+ st.markdown("""
170
+ <div style='text-align: center; color: gray; font-size: 0.8em;'>
171
+ Enterprise RAG System | <a href='https://github.com/YuvrajSinghBhadoria2/Enterprise-RAG-System'>GitHub</a>
172
+ </div>
173
+ """, unsafe_allow_html=True)
configs/default.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ app:
2
+ title: "Enterprise RAG Search"
3
+ host: "0.0.0.0"
4
+ port: 8080
5
+
6
+ retrieval:
7
+ top_k_retrieval: 20
8
+ top_k_rerank: 5
9
+ weights:
10
+ bm25: 0.3
11
+ dense: 0.7
12
+
13
+ embeddings:
14
+ model_name: "BAAI/bge-m3"
15
+ device: "cpu" # or cuda
16
+
17
+ reranker:
18
+ model_name: "cross-encoder/ms-marco-MiniLM-L-6-v2"
19
+
20
+ ingestion:
21
+ chunk_size: 512
22
+ chunk_overlap: 50
data/raw/finphrase_000.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
data/raw/finphrase_001.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .
data/raw/finphrase_002.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .
data/raw/finphrase_003.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007 representing 7.7 % of net sales .
data/raw/finphrase_004.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Operating profit totalled EUR 21.1 mn , up from EUR 18.6 mn in 2007 , representing 9.7 % of net sales .
data/raw/finphrase_005.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Finnish Talentum reports its operating profit increased to EUR 20.5 mn in 2005 from EUR 9.3 mn in 2004 , and net sales totaled EUR 103.3 mn , up from EUR 96.4 mn .
data/raw/finphrase_006.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Clothing retail chain Sepp+ñl+ñ 's sales increased by 8 % to EUR 155.2 mn , and operating profit rose to EUR 31.1 mn from EUR 17.1 mn in 2004 .
data/raw/finphrase_007.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Consolidated net sales increased 16 % to reach EUR74 .8 m , while operating profit amounted to EUR0 .9 m compared to a loss of EUR0 .7 m in the prior year period .
data/raw/finphrase_008.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Foundries division reports its sales increased by 9.7 % to EUR 63.1 mn from EUR 57.5 mn in the corresponding period in 2006 , and sales of the Machine Shop division increased by 16.4 % to EUR 41.2 mn from EUR 35.4 mn in the corresponding period in 2006 .
data/raw/finphrase_009.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ HELSINKI ( AFX ) - Shares closed higher , led by Nokia after it announced plans to team up with Sanyo to manufacture 3G handsets , and by Nokian Tyres after its fourth-quarter earnings report beat analysts ' expectations , dealers said .
data/raw/finphrase_010.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Its board of directors will propose a dividend of EUR0 .12 per share for 2010 , up from the EUR0 .08 per share paid in 2009 .
data/raw/finphrase_011.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ MegaFon 's subscriber base increased 16.1 % in 2009 to 50.5 million users as of December 31 , while its market share by the number of customers amounted to 24 % as of late 2009 , up from 23 % as of late 2008 , according to TeliaSonera estimates .
data/raw/finphrase_012.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Net income from life insurance doubled to EUR 6.8 mn from EUR 3.2 mn , and net income from non-life insurance rose to EUR 5.2 mn from EUR 1.5 mn in the corresponding period in 2009 .
data/raw/finphrase_013.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Net sales increased to EUR193 .3 m from EUR179 .9 m and pretax profit rose by 34.2 % to EUR43 .1 m. ( EUR1 = USD1 .4 )
data/raw/finphrase_014.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Net sales surged by 18.5 % to EUR167 .8 m. Teleste said that EUR20 .4 m , or 12.2 % , of the sales came from the acquisitions made in 2009 .
data/raw/finphrase_015.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Nordea Group 's operating profit increased in 2010 by 18 percent year-on-year to 3.64 billion euros and total revenue by 3 percent to 9.33 billion euros .
data/raw/finphrase_016.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Operating profit for the nine-month period increased from EUR13 .6 m , while net sales increased from EUR394 .7 m , as compared to the corresponding period in 2005 .
data/raw/finphrase_017.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Operating profit for the three-month period increased from EUR1 .2 m , while revenue increased from EUR20 .2 m , as compared to the corresponding period in 2005 .
data/raw/finphrase_018.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ The company 's net profit rose 11.4 % on the year to 82.2 million euros in 2005 on sales of 686.5 million euros , 13.8 % up on the year , the company said earlier .
data/raw/finphrase_019.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ The Lithuanian beer market made up 14.41 million liters in January , a rise of 0.8 percent from the year-earlier figure , the Lithuanian Brewers ' Association reporting citing the results from its members .
data/raw/finphrase_020.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Viking Line 's cargo revenue increased by 5.4 % to EUR 21.46 mn , and cargo volume increased by 2.4 % to 70,116 cargo units .
data/raw/finphrase_021.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ The fair value of the property portfolio doubled as a result of the Kapiteeli acquisition and totalled EUR 2,686.2 1,259.7 million .
data/raw/finphrase_022.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ 10 February 2011 - Finnish media company Sanoma Oyj HEL : SAA1V said yesterday its 2010 net profit almost tripled to EUR297 .3 m from EUR107 .1 m for 2009 and announced a proposal for a raised payout .
data/raw/finphrase_023.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ A Helsinki : ELIiV today reported EPS of EUR1 .13 for 2009 , an increase over EPS of EUR1 .12 in 2008 .
data/raw/finphrase_024.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Commission income increased by 22 % to EUR 4.4 mn , and lending volume rose by 13.5 % .
data/raw/finphrase_025.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ In January , traffic , measured in revenue passenger kilometres RPK , went up by 3.2 % and capacity , measured in available seat kilometres ASK , rose by 12.2 % .
data/raw/finphrase_026.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ In January-September 2010 , Fiskars ' net profit went up by 14 % year-on-year to EUR 65.4 million and net sales to EUR 525.3 million from EUR 487.7 million .
data/raw/finphrase_027.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Net income from life insurance rose to EUR 16.5 mn from EUR 14.0 mn , and net income from non-life insurance to EUR 22.6 mn from EUR 15.2 mn in 2009 .
data/raw/finphrase_028.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Sales have risen in other export markets .
data/raw/finphrase_029.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Sales increased due to growing market rates and increased operations .
data/raw/finphrase_030.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ The agreement strengthens our long-term partnership with Nokia Siemens Networks .
data/raw/finphrase_031.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ The company 's order book stood at 1.5 bln euro $ 2.2 bln on September 30 , 2007 , up by 24.2 pct on the year , with international orders amounting to 365 mln euro $ 534.3 mln .
data/raw/finphrase_032.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ The company said that paper demand increased in all of its main markets , including of publication papers , and that it increased average paper prices by 4 percent compared with last year .
data/raw/finphrase_033.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ The world 's second largest stainless steel maker said net profit in the three-month period until Dec. 31 surged to euro603 million US$ 781 million , or euro3 .33 US$ 4.31 per share , from euro172 million , or euro0 .94 per share , the previous year .
data/raw/finphrase_034.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Shares of Standard Chartered ( STAN ) rose 1.2 % in the FTSE 100 , while Royal Bank of Scotland ( RBS ) shares rose 2 % and Barclays shares ( BARC ) ( BCS ) were up 1.7 % .
data/raw/finphrase_035.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Source: FinancialPhrasebank
2
+
3
+ Shares of Nokia Corp. rose Thursday after the cell phone maker said its third-quarter earnings almost doubled and its share of the global handset market increased .