# NOTE: page-scrape residue preserved from the capture ("Spaces: Sleeping / Sleeping"); not part of the program.
| from flask import Flask, render_template, request | |
| import matplotlib | |
| matplotlib.use('Agg') | |
| import matplotlib.pyplot as plt | |
| import io | |
| import base64 | |
| import pandas as pd | |
| import google.generativeai as genai | |
| import os | |
| from docx import Document | |
| import plotly.express as px | |
| import plotly.io as pio | |
# Flask application object and upload settings.
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max file size

# Configure Gemini API.
# The key is read from the GOOGLE_API_KEY environment variable so that no
# credential is stored in the source file.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY', '')
if not GOOGLE_API_KEY:
    print("Warning: GOOGLE_API_KEY is not set; Gemini requests will fail.")
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-2.0-flash')
def ensure_upload_folder():
    """Create the directory named by app.config['UPLOAD_FOLDER'] if it is missing."""
    # exist_ok=True removes the window between an existence check and the
    # creation call in which another request could create the folder first.
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
def extract_text_from_docx(file_path):
    """Return the text of a .docx file as one newline-joined string.

    Body paragraphs come first, followed by one line per table row with the
    row's cell texts separated by tabs.

    Args:
        file_path: Path of the .docx file to open with ``Document``.

    Returns:
        The collected paragraph and table-row text joined with '\\n'.
    """
    doc = Document(file_path)
    lines = [paragraph.text for paragraph in doc.paragraphs]
    # doc.paragraphs does not include text that lives inside tables, and the
    # event counts this app looks for are described as tabular data, so the
    # top-level tables are appended explicitly.
    # NOTE(review): tables nested inside other tables are not visited here.
    for table in doc.tables:
        for row in table.rows:
            lines.append('\t'.join(cell.text.strip() for cell in row.cells))
    return '\n'.join(lines)
# Instructions appended after the document text when querying the model.
_EXTRACTION_PROMPT = """
Extract the event counts from the following table format in the text:
2022-2023
Cultural competitions/events: NUMBER
Sports competitions/events: NUMBER
Technical fest/Academic fest: NUMBER
Social activities/events: NUMBER
Any other events through Active clubs and forums: NUMBER
2021-2022
Cultural competitions/events: NUMBER
Sports competitions/events: NUMBER
Technical fest/Academic fest: NUMBER
Social activities/events: NUMBER
Any other events through Active clubs and forums: NUMBER
2020-2021
Cultural competitions/events: NUMBER
Sports competitions/events: NUMBER
Technical fest/Academic fest: NUMBER
Social activities/events: NUMBER
Any other events through Active clubs and forums: NUMBER
2019-2020
Cultural competitions/events: NUMBER
Sports competitions/events: NUMBER
Technical fest/Academic fest: NUMBER
Social activities/events: NUMBER
Any other events through Active clubs and forums: NUMBER
2018-2019
Cultural competitions/events: NUMBER
Sports competitions/events: NUMBER
Technical fest/Academic fest: NUMBER
Social activities/events: NUMBER
Any other events through Active clubs and forums: NUMBER
Look for these exact numbers in the text. The data appears in a table with years and categories.
For each year, find:
- Number of Cultural competitions/events
- Number of Sports competitions/events
- Number of Technical fest/Academic fest events
- Number of Social activities/events
- Number of "Any other events through Active clubs and forums"
Return the data in this exact Python dictionary format:
{
'2022-2023': {'Cultural': 11, 'Sports': 10, 'Technical': 29, 'Social': 15, 'Other': 20},
'2021-2022': {'Cultural': 7, 'Sports': 8, 'Technical': 13, 'Social': 12, 'Other': 15},
'2020-2021': {'Cultural': 7, 'Sports': 9, 'Technical': 15, 'Social': 10, 'Other': 17},
'2019-2020': {'Cultural': 12, 'Sports': 17, 'Technical': 21, 'Social': 14, 'Other': 11},
'2018-2019': {'Cultural': 8, 'Sports': 17, 'Technical': 15, 'Social': 11, 'Other': 9}
}
Important:
- Use the EXACT numbers from the document
- Include ALL years from 2018-2019 to 2022-2023
- Make sure to find the correct table in the document that has these numbers
- Return only the Python dictionary, no other text
"""

# Years and categories that a valid extraction result must contain.
_EXPECTED_YEARS = ('2022-2023', '2021-2022', '2020-2021', '2019-2020', '2018-2019')
_REQUIRED_CATEGORIES = ('Cultural', 'Sports', 'Technical', 'Social', 'Other')

# (label, pattern) pairs used only for the diagnostic output printed before the model call.
_DEBUG_PATTERNS = (
    ("cultural", r'Cultural competitions/events\s*(\d+)'),
    ("sports", r'Sports competitions/events\s*(\d+)'),
    ("technical", r'Technical fest/Academic fest\s*(\d+)'),
    ("other", r'Any other events.*?(\d+)'),
    ("social", r'Social activities/events\s*(\d+)'),
)


def _strip_code_fence(response_text):
    """Return the contents of the first Markdown code fence, or the stripped text if unfenced."""
    # Built from single backticks so the marker survives Markdown-aware tooling;
    # an empty marker would make str.split raise ValueError on every call.
    fence = '`' * 3
    cleaned = response_text.strip()
    if fence in cleaned:
        cleaned = cleaned.split(fence)[1]
        first_line, _, remainder = cleaned.partition('\n')
        # Drop a bare language tag such as "python" or "json" after the opening fence.
        if first_line.strip().isalpha():
            cleaned = remainder
    return cleaned.strip()


def _parse_dict_literal(literal_text):
    """Parse a Python-literal or JSON object string into a Python value."""
    import ast
    import json

    try:
        return ast.literal_eval(literal_text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Fallback to JSON parsing if the text is not a valid Python literal.
        return json.loads(literal_text.replace("'", '"'))


def _validate_event_data(data):
    """Raise ValueError unless data maps every expected year to all required categories."""
    if not isinstance(data, dict):
        raise ValueError("Response is not a dictionary")
    if not all(year in data for year in _EXPECTED_YEARS):
        raise ValueError("Missing some years in the data")
    for year, counts in data.items():
        if not isinstance(counts, dict) or not all(cat in counts for cat in _REQUIRED_CATEGORIES):
            raise ValueError(f"Missing categories in year {year}")


def extract_data_using_gemini(text):
    """Ask the Gemini model for the per-year event counts found in *text*.

    The document text followed by the extraction prompt is sent to the
    module-level ``model``; the reply is stripped of any Markdown code fence,
    parsed as a dictionary and validated.

    Args:
        text: Plain text of the uploaded document.

    Returns:
        A dict mapping year labels (e.g. '2022-2023') to dicts of category
        counts, or None if the request, parsing or validation fails. Failures
        are reported with print() rather than raised.
    """
    import re

    response_text = None
    try:
        # Diagnostic output: what a plain regex scan finds in the text.
        print("\nSearching in text:")
        print("=" * 50)
        years = re.findall(r'(20\d{2}-20\d{2})', text)
        print(f"Found years: {years}")
        for label, pattern in _DEBUG_PATTERNS:
            print(f"Found {label} numbers: {re.findall(pattern, text)}")
        print("=" * 50)

        response = model.generate_content(text + "\n" + _EXTRACTION_PROMPT)
        response_text = response.text.strip()
        print("Raw response:", response_text)

        # Remove any markdown formatting around the dictionary.
        response_text = _strip_code_fence(response_text)
        print("Cleaned response:", response_text)

        data = _parse_dict_literal(response_text)
        _validate_event_data(data)
        return data
    except Exception as e:
        # Boundary handler: the caller treats None as "could not extract".
        print(f"Error processing with Gemini: {str(e)}")
        print(f"Response text was: {response_text if response_text is not None else 'No response text'}")
        return None
def _percent_change(old, new):
    """Return the percentage change from *old* to *new*, or 0.0 when *old* is zero."""
    if old == 0:
        return 0.0
    return float(new - old) / float(old) * 100


def _build_insights(main_insight, strengths=(), weaknesses=(), opportunities=(),
                    threats=(), recommendations=()):
    """Assemble the insight dictionary shape shared by every plot type."""
    return {
        'main_insight': main_insight,
        'swot': {
            'strengths': list(strengths),
            'weaknesses': list(weaknesses),
            'opportunities': list(opportunities),
            'threats': list(threats),
        },
        'recommendations': list(recommendations),
    }


def get_graph_insights(data, plot_type):
    """Generate detailed insights including SWOT analysis for different types of plots.

    Args:
        data: Dict mapping year labels (e.g. '2018-2019') to dicts of
            category -> event count.
        plot_type: One of 'bar', 'pie', 'line', 'growth' or 'area'. Any other
            value yields an empty placeholder result.

    Returns:
        A dict with keys 'main_insight' (str), 'swot' (dict with 'strengths',
        'weaknesses', 'opportunities' and 'threats' lists) and
        'recommendations' (list).
    """
    # Rows are sorted by year label so that iloc[0] is the earliest year and
    # iloc[-1] the latest regardless of the order of the input dict; without
    # this, newest-first input makes every trend and growth figure run backwards.
    df = pd.DataFrame(data).T.sort_index()

    if plot_type == 'bar':
        total_by_category = df.sum()
        max_category = total_by_category.idxmax()
        min_category = total_by_category.idxmin()
        avg_events = total_by_category.mean()
        return _build_insights(
            main_insight=f"The most frequent event category overall is {max_category} with {int(total_by_category[max_category])} events, while {min_category} has the least with {int(total_by_category[min_category])} events.",
            strengths=[
                f"Strong performance in {max_category} events",
                "Diverse range of events across categories",
                f"Average of {avg_events:.1f} events per category",
            ],
            weaknesses=[
                f"Low participation in {min_category} events",
                "Uneven distribution across categories",
                "Potential resource allocation issues",
            ],
            opportunities=[
                f"Room for growth in {min_category} category",
                "Potential for cross-category events",
                "Scope for balanced development",
            ],
            threats=[
                "Risk of over-dependence on dominant category",
                "Resource strain in peak periods",
                "Sustainability challenges",
            ],
            recommendations=[
                f"Consider boosting {min_category} events",
                "Implement balanced resource allocation",
                "Develop cross-category initiatives",
            ],
        )

    if plot_type == 'pie':
        # Year labels of the form 'YYYY-YYYY' order chronologically as strings.
        latest_year = max(data)
        year_data = data[latest_year]
        total = sum(year_data.values())
        max_cat = max(year_data.items(), key=lambda item: item[1])
        min_cat = min(year_data.items(), key=lambda item: item[1])
        # A year with no events at all would otherwise divide by zero.
        percentage = (max_cat[1] / total) * 100 if total else 0.0
        return _build_insights(
            main_insight=f"In {latest_year}, {max_cat[0]} events dominated with {max_cat[1]} events ({percentage:.1f}% of total events).",
            strengths=[
                f"Strong presence in {max_cat[0]} category",
                "Clear category leadership",
                "Established event structure",
            ],
            weaknesses=[
                f"Under-representation in {min_cat[0]} category",
                "Imbalanced distribution",
                "Resource concentration risks",
            ],
            opportunities=[
                "Potential for category diversification",
                "Growth in underserved categories",
                "New event type development",
            ],
            threats=[
                "Category saturation risk",
                "Resource allocation challenges",
                "Sustainability concerns",
            ],
            recommendations=[
                "Diversify event portfolio",
                f"Strengthen {min_cat[0]} category",
                "Implement balanced growth strategy",
            ],
        )

    if plot_type == 'line':
        first_mean = df.iloc[0].mean()
        last_mean = df.iloc[-1].mean()
        trend = "increasing" if last_mean > first_mean else "decreasing"
        growth_rate = _percent_change(first_mean, last_mean)
        return _build_insights(
            main_insight=f"The overall trend shows a {trend} pattern with a {growth_rate:.1f}% change in event frequency over the years.",
            strengths=[
                f"Consistent {trend} trend",
                "Clear growth trajectory",
                "Established pattern",
            ],
            weaknesses=[
                "Fluctuations in growth rate",
                "Periodic inconsistencies",
                "Resource scaling challenges",
            ],
            opportunities=[
                "Growth optimization potential",
                "Pattern regularization",
                "Strategic planning possibilities",
            ],
            threats=[
                "Sustainability of growth rate",
                "Resource management challenges",
                "Market saturation risks",
            ],
            recommendations=[
                "Develop sustainable growth plan",
                "Implement resource scaling strategy",
                "Monitor growth patterns",
            ],
        )

    if plot_type == 'growth':
        growth_rates = df.pct_change() * 100
        # A zero count in the previous year yields +/-inf; treat those cells as
        # missing so they do not swamp the average, peak and low figures.
        growth_rates = growth_rates.replace([float('inf'), float('-inf')], float('nan'))
        avg_growth = growth_rates.mean().mean()
        max_growth = growth_rates.max().max()
        min_growth = growth_rates.min().min()
        return _build_insights(
            main_insight=f"The average year-over-year growth rate is {avg_growth:.1f}%, with peaks of {max_growth:.1f}% and lows of {min_growth:.1f}%.",
            strengths=[
                "Positive average growth rate",
                "Strong peak performance periods",
                "Growth momentum",
            ],
            weaknesses=[
                "Growth rate volatility",
                "Negative growth periods",
                "Inconsistent patterns",
            ],
            opportunities=[
                "Growth stabilization potential",
                "Performance optimization",
                "Strategic growth planning",
            ],
            threats=[
                "Growth sustainability",
                "Resource scaling challenges",
                "Market fluctuations",
            ],
            recommendations=[
                "Stabilize growth patterns",
                "Develop contingency plans",
                "Implement growth monitoring",
            ],
        )

    if plot_type == 'area':
        total_growth = _percent_change(df.iloc[0].sum(), df.iloc[-1].sum())
        # With a single year there are no intervals; avoid dividing by zero.
        avg_yearly_growth = total_growth / max(len(df) - 1, 1)
        return _build_insights(
            main_insight=f"The cumulative events show a {total_growth:.1f}% total change, averaging {avg_yearly_growth:.1f}% yearly growth.",
            strengths=[
                "Consistent cumulative growth",
                "Strong overall trajectory",
                "Clear progress pattern",
            ],
            weaknesses=[
                "Growth rate variations",
                "Resource scaling challenges",
                "Potential sustainability issues",
            ],
            opportunities=[
                "Long-term growth potential",
                "Pattern optimization",
                "Strategic expansion",
            ],
            threats=[
                "Scaling challenges",
                "Resource constraints",
                "Market saturation",
            ],
            recommendations=[
                "Develop long-term growth strategy",
                "Implement resource planning",
                "Monitor cumulative trends",
            ],
        )

    return _build_insights("No specific insights available for this visualization.")
def create_plots(data):
    """Build the Plotly charts, their insights and summary statistics.

    Args:
        data: Dict mapping year labels to dicts of category -> event count.

    Returns:
        A dict with one entry per chart ('bar', 'pie', 'line', 'growth',
        'area'), each holding the chart's HTML fragment under 'plot' and the
        result of get_graph_insights under 'insight', plus a 'stats' entry
        with aggregate figures.
    """
    # Sort rows by year label so charts read oldest -> newest and every growth
    # figure compares an earlier year with a later one, whatever order the
    # input dict arrived in.
    df = pd.DataFrame(data).T.sort_index()

    # Year labels of the form 'YYYY-YYYY' order chronologically as strings.
    latest_year = max(data)
    latest_counts = data[latest_year]
    growth_rates = df.pct_change() * 100

    figures = {
        'bar': px.bar(df, barmode='group', title='Events Distribution Across Years'),
        'pie': px.pie(names=list(latest_counts.keys()), values=list(latest_counts.values()),
                      title=f'Event Distribution for {latest_year}'),
        'line': px.line(df, markers=True, title='Event Trends Over Years'),
        'growth': px.bar(growth_rates, title='Year-over-Year Growth Rate by Category'),
        'area': px.area(df, title='Cumulative Events Distribution'),
    }
    plots = {
        plot_type: {
            'plot': pio.to_html(figure, full_html=False),
            'insight': get_graph_insights(data, plot_type),
        }
        for plot_type, figure in figures.items()
    }

    # Statistical Analysis
    earliest_counts = df.iloc[0]
    newest_counts = df.iloc[-1]
    events_per_year = df.sum(axis=1)
    events_per_category = df.sum()
    plots['stats'] = {
        'total_events': events_per_category.sum(),
        'avg_events_per_year': events_per_year.mean().round(2),
        'most_active_year': events_per_year.idxmax(),
        'most_common_category': events_per_category.idxmax(),
        'growth_analysis': {
            'total_growth': ((newest_counts.sum() - earliest_counts.sum()) / earliest_counts.sum() * 100).round(2),
            'category_growth': ((newest_counts - earliest_counts) / earliest_counts * 100).round(2).to_dict(),
        },
    }
    return plots
# NOTE(review): no route decorator is present in the visible source, so the view
# was never registered with the app; '/' is assumed to be the intended URL.
@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the upload page and, on POST, the analysis of an uploaded .docx file.

    On POST the file in the 'document' form field is saved to a temporary file
    in the upload folder, its text is extracted and passed to
    extract_data_using_gemini, and the resulting data is turned into plots.
    The temporary file is removed whether or not processing succeeds.

    Returns:
        The rendered 'index.html' template, given either ``plots`` or an
        ``error`` message.
    """
    import tempfile

    plots = None
    error_message = None
    if request.method == 'POST':
        if 'document' not in request.files:
            error_message = 'No file uploaded'
            return render_template('index.html', error=error_message)
        file = request.files['document']
        if file.filename == '':
            error_message = 'No file selected'
            return render_template('index.html', error=error_message)
        if file and file.filename.lower().endswith('.docx'):
            ensure_upload_folder()
            # Save under a server-generated name: the client-supplied filename
            # may contain path separators, and two uploads with the same name
            # would otherwise overwrite each other.
            fd, file_path = tempfile.mkstemp(suffix='.docx', dir=app.config['UPLOAD_FOLDER'])
            os.close(fd)
            try:
                file.save(file_path)
                text = extract_text_from_docx(file_path)
                data = extract_data_using_gemini(text)
                print("Extracted data:", data)
                if data:
                    plots = create_plots(data)
                else:
                    error_message = 'Could not extract data from document. Please check the document format.'
            except Exception as e:
                error_message = f'Error processing document: {str(e)}'
                print(f"Full error: {str(e)}")
            finally:
                # Always clean up the upload, including when processing failed.
                if os.path.exists(file_path):
                    os.remove(file_path)
        else:
            error_message = 'Please upload a .docx file'
    return render_template('index.html', plots=plots, error=error_message)
if __name__ == '__main__':
    # Flask's debug mode exposes an interactive debugger that can execute
    # arbitrary code, so it is enabled only when FLASK_DEBUG explicitly asks for it.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled, port=5001)