Merge pull request 'feature/init_app' (#1) from feature/init_app into main
Reviewed-on: #1
This commit was merged in pull request #1.
This commit is contained in:
5
.dockerignore
Normal file
5
.dockerignore
Normal file
@@ -0,0 +1,5 @@
|
||||
__pycache__
|
||||
*.pyc
|
||||
*.pyo
|
||||
.git
|
||||
.DS_Store
|
||||
0
.gitea/workflows/pipeline.yml
Normal file
0
.gitea/workflows/pipeline.yml
Normal file
5
.gitignore
vendored
5
.gitignore
vendored
@@ -3,9 +3,11 @@
|
||||
.DS_Store
|
||||
.AppleDouble
|
||||
.LSOverride
|
||||
.env
|
||||
|
||||
# Icon must end with two \r
|
||||
Icon
|
||||
Icon
|
||||
|
||||
|
||||
# Thumbnails
|
||||
._*
|
||||
@@ -227,4 +229,3 @@ cython_debug/
|
||||
|
||||
# PyPI configuration file
|
||||
.pypirc
|
||||
.pypirc
|
||||
|
||||
19
Dockerfile
Normal file
19
Dockerfile
Normal file
@@ -0,0 +1,19 @@
|
||||
# Base image: official Apache Airflow (Python 3.9)
FROM apache/airflow:2.7.1-python3.9 as build

ENV USERNAME=airflow

# Airflow home directory inside the container
ENV AIRFLOW_HOME=/opt/airflow

# Bootstrap script run by docker-compose as the webserver command
COPY --chown=airflow:airflow scripts/init_airflow.sh /usr/local/bin/

# Copy the DAGs folder (also contains requirements.txt used below)
COPY dags ${AIRFLOW_HOME}/dags/

WORKDIR ${AIRFLOW_HOME}

# Install additional Python dependencies
RUN pip install --no-cache-dir -r dags/requirements.txt

RUN chmod +x /usr/local/bin/init_airflow.sh
|
||||
38
README.md
38
README.md
@@ -1,3 +1,37 @@
|
||||
# airflow_neo4j
|
||||
### Пример использования airflow с подключением к БД neo4j
|
||||
|
||||
Airflow с подключением к БД Neo4j
|
||||
### Как развернуть проект
|
||||
|
||||
1. **Клонируйте репозиторий:**
|
||||
|
||||
```bash
|
||||
git clone https://git.ooru.ru/radik/airflow_neo4j.git
|
||||
cd airflow_neo4j
|
||||
```
|
||||
|
||||
2. **Скопируйте и настройте файл переменных окружения:**
|
||||
|
||||
Скопируйте файл `env.example` в `.env` и обязательно заполните все переменные в файле (например, пароли и данные для подключения к Neo4j и Postgres).
|
||||
|
||||
```bash
|
||||
cp env.example .env
|
||||
# Откройте .env и укажите свои значения
|
||||
```
|
||||
|
||||
3. **Запустите сервисы через Docker Compose:**
|
||||
|
||||
```bash
|
||||
docker-compose up --build
|
||||
```
|
||||
|
||||
4. **Дождитесь сообщения в логе**:
|
||||
|
||||
В процессе инициализации дождитесь строки:
|
||||
|
||||
Initialization complete! Starting webserver...
|
||||
|
||||
После этого интерфейс Airflow будет доступен по адресу: http://ip:8080
|
||||
|
||||
5. **Первый запуск DAG**
|
||||
|
||||
При первом запуске дага откройте его в Airflow UI и включите тумблер (On). Не нажимайте кнопку запуска (Play) вручную — дождитесь автоматического срабатывания по расписанию или другим триггерам.
|
||||
|
||||
0
__init__.py
Normal file
0
__init__.py
Normal file
80
dags/dag_01.py
Normal file
80
dags/dag_01.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""Daily DAG: Neo4j connection health check followed by generation and
loading of sample user-action data into Neo4j."""
from datetime import datetime, timedelta

from airflow import DAG
from airflow.models import Variable
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import PythonOperator
from task_01.composites import UserActionTransferTask
from task_01.target_db.utills import get_driver

# Default arguments applied to every task in the DAG
default_args = {
    'owner': 'data_engineer',
    'depends_on_past': False,
    'start_date': datetime(2025, 11, 1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'execution_timeout': timedelta(minutes=30),
}

# DAG definition (cron schedule: daily at 09:00)
with DAG(
    'neo4j_healthcheck_and_sample_loader',
    default_args=default_args,
    schedule_interval='0 9 * * *',
    catchup=False,
    tags=['neo4j', 'healthcheck', 'data_loading', 'vers.01'],
    description='DAG для проверки здоровья Neo4j и загрузки тестовых данных',
) as dag:
    # NOTE(review): Variable.get and get_driver execute at DAG *parse* time,
    # i.e. on every scheduler file scan — consider deferring driver creation
    # into the task callables.
    conn_id = Variable.get("CONN_ID")
    driver = get_driver(conn_id)

    task = UserActionTransferTask(driver)

    # Initial no-op marker task
    start_task = EmptyOperator(
        task_id='start_task',
        dag=dag,
    )

    # Task: verify the connection to Neo4j
    check_neo4j_connection_task = PythonOperator(
        task_id='check_neo4j_connection',
        python_callable=task.run.check_neo4j_connection,
        dag=dag,
        retries=3,
        retry_delay=timedelta(minutes=1),
        doc_md="Проверяет соединение с Neo4j и возвращает сообщение о состоянии соединения",
    )

    # Task: generate sample data and write it to a CSV file
    generate_sample_data_task = PythonOperator(
        task_id='generate_sample_data',
        python_callable=task.run.generate_sample_data,
        dag=dag,
        retries=3,
        retry_delay=timedelta(minutes=1),
        doc_md="Генерирует тестовые данные, сохраняет их в CSV файл ",
    )

    # Task: load the generated CSV into Neo4j
    load_data_to_neo4j_task = PythonOperator(
        task_id='load_data_to_neo4j',
        python_callable=task.run.load_data_to_neo4j,
        dag=dag,
        retries=3,
        retry_delay=timedelta(minutes=1),
        doc_md="Загружает данные из CSV файла в БД",
    )

    # Final no-op marker task
    end_task = EmptyOperator(
        task_id='end_task',
        dag=dag,
    )

    # Task dependency chain
    (start_task >> check_neo4j_connection_task >> generate_sample_data_task >>
     load_data_to_neo4j_task >> end_task)
|
||||
4
dags/requirements.txt
Normal file
4
dags/requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
# Dependencies installed into the Airflow image (see Dockerfile).
apache-airflow~=2.7.1
# Provides the "neo4j" connection type registered by scripts/init_airflow.sh.
apache-airflow-providers-neo4j
pandas
# Pinned below 0.6 — presumably newer Flask-Session breaks this Airflow
# version's webserver; verify before unpinning.
Flask-Session<0.6
|
||||
0
dags/task_01/__init__.py
Normal file
0
dags/task_01/__init__.py
Normal file
22
dags/task_01/composites.py
Normal file
22
dags/task_01/composites.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from task_01.target_db.repositories import TargetDBRepo
|
||||
from task_01.tasks import UserActionTransfer
|
||||
|
||||
|
||||
class Adapters:
    """Factory for repository objects bound to a single Neo4j driver."""

    def __init__(self, driver):
        # Keep only the raw driver; repositories are built lazily on access.
        self.driver = driver

    @property
    def target_repo(self):
        """A TargetDBRepo wrapping the stored driver."""
        repo = TargetDBRepo(self.driver)
        return repo
|
||||
|
||||
|
||||
class UserActionTransferTask:
    """Composition root: wires adapters into a ready-to-run transfer task."""

    def __init__(self, driver):
        self.adapters = Adapters(driver)

    @property
    def run(self):
        """Build a UserActionTransfer backed by the target repository."""
        target = self.adapters.target_repo
        return UserActionTransfer(target_db=target)
|
||||
8
dags/task_01/dto.py
Normal file
8
dags/task_01/dto.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class ConnectionInfo:
    """Endpoint and credentials for a Neo4j database connection."""

    uri: str        # bolt URI, e.g. "bolt://host:7687" (built in utills.py)
    username: str   # login used for basic auth
    password: str   # password used for basic auth
|
||||
0
dags/task_01/target_db/__init__.py
Normal file
0
dags/task_01/target_db/__init__.py
Normal file
41
dags/task_01/target_db/repositories.py
Normal file
41
dags/task_01/target_db/repositories.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import logging
|
||||
|
||||
from neo4j import Driver
|
||||
from pandas import DataFrame
|
||||
|
||||
# Number of rows sent to Neo4j per UNWIND query.
BATCH_SIZE = 1000


class TargetDBRepo:
    """Data-access layer for :User nodes in the target Neo4j database."""

    def __init__(self, driver: Driver):
        self.driver = driver
        self.log = logging.getLogger(__name__)

    def save_users(self, users: DataFrame) -> None:
        """Insert every row of ``users`` as a :User node, in batches."""
        query = """
        UNWIND $rows AS row
        CREATE (u:User {
            user_id: row.user_id,
            action: row.action,
            timestamp: row.timestamp
        })
        """

        with self.driver.session() as session:
            total = len(users)
            for start in range(0, total, BATCH_SIZE):
                chunk = users.iloc[start:start + BATCH_SIZE]
                session.run(query, {"rows": chunk.to_dict(orient="records")})
                self.log.info(
                    "Rows %s–%s saved (%s)", start, start + len(chunk), len(chunk))

    def get_number_of_users(self) -> int:
        """Return the number of :User nodes currently stored."""
        cypher = "MATCH (u:User) RETURN count(u) as user_count"
        with self.driver.session() as session:
            return session.run(cypher).single()["user_count"]

    def check_connection(self):
        """Round-trip a trivial query; return the server's status message."""
        cypher = 'RETURN "Connection successful" AS message'
        with self.driver.session() as session:
            return session.run(cypher).single()["message"]
|
||||
21
dags/task_01/target_db/utills.py
Normal file
21
dags/task_01/target_db/utills.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from airflow.hooks.base import BaseHook
|
||||
from neo4j import Driver, GraphDatabase
|
||||
from task_01.dto import ConnectionInfo
|
||||
|
||||
|
||||
def get_neo4j_connection(conn_id: str) -> ConnectionInfo:
    """Resolve an Airflow connection into Neo4j connection parameters.

    :param conn_id: Airflow connection id to look up.
    :return: ConnectionInfo holding a bolt:// URI and the stored credentials.
    """
    connection = BaseHook.get_connection(conn_id)
    bolt_uri = "bolt://{}:{}".format(connection.host, connection.port)
    return ConnectionInfo(bolt_uri,
                          username=connection.login,
                          password=connection.password)
|
||||
|
||||
|
||||
def get_driver(conn_id: str) -> Driver:
    """Create a Neo4j driver from the Airflow connection ``conn_id``."""
    info = get_neo4j_connection(conn_id)
    credentials = (info.username, info.password)
    return GraphDatabase.driver(uri=info.uri, auth=credentials)
|
||||
100
dags/task_01/tasks.py
Normal file
100
dags/task_01/tasks.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from random import randint
|
||||
|
||||
import pandas as pd
|
||||
from airflow import AirflowException
|
||||
from airflow.models import Variable
|
||||
from task_01.target_db.repositories import TargetDBRepo
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class UserActionTransfer:
    """
    Transfers generated user-action data into Neo4j.

    The three public methods are designed to run as separate Airflow tasks:
    check_neo4j_connection (health check), then generate_sample_data,
    then load_data_to_neo4j.
    """

    def __init__(
            self,
            target_db: TargetDBRepo,
    ):
        # Repository performing all reads/writes against Neo4j.
        self.target_db = target_db
        self.log = logging.getLogger(__name__)

    def generate_sample_data(self):
        """
        Generate fake user-action rows and persist them to a CSV file.

        The file path is published via the Airflow Variable
        "user_action_data_path" so load_data_to_neo4j can find it.
        """
        actions = self.get_fake_user_action_data()
        csv_file_path = self.get_csv_file_path()
        actions.to_csv(csv_file_path, index=False)

        # Hand the path to the downstream task via an Airflow Variable.
        Variable.set("user_action_data_path", csv_file_path)

        self.log.info("Sample data generated and saved to: %s", csv_file_path)
        # Bug fix: was logging.info(f"Data preview: %s", ...) — an f-string
        # misused as a %-format string; use plain lazy %-formatting instead.
        self.log.info("Data preview: %s", actions.head())

    def load_data_to_neo4j(self) -> None:
        """
        Load the previously generated CSV into Neo4j, then delete the file.

        Raises:
            AirflowException: if the CSV produced by generate_sample_data
                is missing.
        """
        csv_file_path = Variable.get("user_action_data_path")

        if not csv_file_path or not os.path.exists(csv_file_path):
            # Bug fix: AirflowException does not apply logging-style
            # %-formatting, so interpolate the path into the message.
            raise AirflowException(f"CSV file not found: {csv_file_path}")

        # Read the CSV file back into a DataFrame.
        user_actions = pd.read_csv(csv_file_path)
        self.log.info("Loaded CSV data with %s rows", len(user_actions))

        self.target_db.save_users(user_actions)
        total_rows = self.target_db.get_number_of_users()
        self.log.info("Total rows: %s", total_rows)

        # Clean up the temporary file once the data is in the database.
        os.remove(csv_file_path)
        self.log.info("Temporary CSV file cleaned up")

    def check_neo4j_connection(self):
        """
        Verify connectivity to Neo4j; return the server's status message.

        Re-raises any driver exception so the Airflow task fails visibly.
        """
        try:
            result = self.target_db.check_connection()
            self.log.info("Neo4j message: %s", result)
            self.log.info("Neo4j connection is healthy")
        except Exception as e:
            self.log.error("Neo4j connection failed: %s", e)
            raise
        return result

    @staticmethod
    def get_csv_file_path() -> str:
        """
        Path of the user-action CSV inside the system temp directory.
        """
        temp_dir = tempfile.gettempdir()
        return os.path.join(temp_dir, "user_action_data.csv")

    @staticmethod
    def get_fake_user_action_data() -> pd.DataFrame:
        """
        Build a DataFrame of 12 490 random user actions with columns
        user_id, action, timestamp.
        """
        actions = ["login", "purchase", "view", "logout", "search"]
        ids = list(range(1, 12491))
        action = [actions[randint(0, len(actions) - 1)] for _ in ids]
        timestamp = [datetime.now() for _ in ids]
        return pd.DataFrame({
            "user_id": ids,
            "action": action,
            "timestamp": timestamp,
        })
|
||||
65
docker-compose.yml
Normal file
65
docker-compose.yml
Normal file
@@ -0,0 +1,65 @@
|
||||
# Compose stack: Postgres (Airflow metadata DB), Neo4j (target graph DB),
# and an Airflow webserver + scheduler built from the local Dockerfile.
version: '3.8'

services:
  # Metadata database for Airflow (LocalExecutor, see SQL_ALCHEMY_CONN below).
  postgres:
    image: postgres:14
    env_file: .env
    environment:
      POSTGRES_USER: ${POSTGRES_USER}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      POSTGRES_DB: ${POSTGRES_DB}
    volumes:
      - postgres_data:/var/lib/postgresql/data
    restart: always

  # Target database the DAG writes user-action nodes into.
  neo4j:
    image: neo4j:5
    environment:
      - NEO4J_AUTH=${NEO4J_USER}/${NEO4J_PASSWORD}
    ports:
      - "7474:7474"  # HTTP interface
      - "7687:7687"  # bolt — matches NEO4J_PORT in env.example
    volumes:
      - neo4j_data:/data
    restart: always

  # Runs scripts/init_airflow.sh: DB migration, admin user, Neo4j connection,
  # then starts the webserver on :8080.
  airflow-webserver:
    build: .
    depends_on:
      - postgres
      - neo4j
    env_file: .env
    environment:
      AIRFLOW__CORE__EXECUTOR: LocalExecutor
      AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres/${POSTGRES_DB}
      AIRFLOW__CORE__LOAD_EXAMPLES: 'False'
    volumes:
      - ./dags:/opt/airflow/dags
      - ./logs:/opt/airflow/logs
      - ./plugins:/opt/airflow/plugins
    ports:
      - "8080:8080"
    command: bash -c "/usr/local/bin/init_airflow.sh"
    restart: always

  # Scheduler uses the same image and env; starts after the webserver.
  airflow-scheduler:
    build: .
    depends_on:
      - postgres
      - neo4j
      - airflow-webserver
    env_file: .env
    environment:
      AIRFLOW__CORE__EXECUTOR: LocalExecutor
      AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres/${POSTGRES_DB}
      AIRFLOW__CORE__LOAD_EXAMPLES: 'False'
    volumes:
      - ./dags:/opt/airflow/dags
      - ./logs:/opt/airflow/logs
      - ./plugins:/opt/airflow/plugins
    command: scheduler
    restart: always

# Named volumes persist database data across container recreation.
volumes:
  postgres_data:
  neo4j_data:
|
||||
20
env.example
Normal file
20
env.example
Normal file
@@ -0,0 +1,20 @@
|
||||
# Airflow credentials
|
||||
AIRFLOW_ADMIN_USER=admin
|
||||
AIRFLOW_ADMIN_PASSWORD=admin
|
||||
AIRFLOW_ADMIN_EMAIL=admin@example.com
|
||||
AIRFLOW_ADMIN_FIRSTNAME=Admin
|
||||
AIRFLOW_ADMIN_LASTNAME=User
|
||||
|
||||
# Database (Postgres)
|
||||
POSTGRES_USER=airflow
|
||||
POSTGRES_PASSWORD=airflow
|
||||
POSTGRES_DB=airflow
|
||||
|
||||
# Neo4j connection details
|
||||
NEO4J_CONN_ID=
|
||||
NEO4J_URI=
|
||||
NEO4J_USER=
|
||||
NEO4J_PASSWORD=
|
||||
NEO4J_PORT=7687
|
||||
|
||||
AIRFLOW_HOME=/opt/airflow
|
||||
54
scripts/init_airflow.sh
Normal file
54
scripts/init_airflow.sh
Normal file
@@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env bash
# Initialize Airflow: migrate the metadata DB, create the admin user,
# register the Neo4j connection and required Variables, then start the
# webserver (this script is the webserver container's command).
#
# Required env vars (from .env): AIRFLOW_ADMIN_USER, AIRFLOW_ADMIN_PASSWORD,
# AIRFLOW_ADMIN_EMAIL, AIRFLOW_ADMIN_FIRSTNAME, AIRFLOW_ADMIN_LASTNAME,
# NEO4J_CONN_ID, NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, NEO4J_PORT.
set -euo pipefail

# Fail fast on missing configuration. Previously an empty NEO4J_CONN_ID made
# `grep -q ""` below match everything, silently skipping connection creation.
: "${AIRFLOW_ADMIN_EMAIL:?AIRFLOW_ADMIN_EMAIL must be set (see env.example)}"
: "${NEO4J_CONN_ID:?NEO4J_CONN_ID must be set (see env.example)}"

echo "Initializing Airflow..."

# Apply DB migrations (idempotent; safe to run on every start).
airflow db migrate

# Create the admin user unless one with this email already exists.
# -F -- : match the email as a literal string, not a regex.
if ! airflow users list | grep -qF -- "${AIRFLOW_ADMIN_EMAIL}"; then
  echo "Creating Airflow admin user..."
  airflow users create \
    --username "${AIRFLOW_ADMIN_USER}" \
    --firstname "${AIRFLOW_ADMIN_FIRSTNAME}" \
    --lastname "${AIRFLOW_ADMIN_LASTNAME}" \
    --role Admin \
    --email "${AIRFLOW_ADMIN_EMAIL}" \
    --password "${AIRFLOW_ADMIN_PASSWORD}"
else
  echo "✅ Admin user already exists."
fi

# Register the Neo4j connection unless it is already present.
if ! airflow connections list | grep -qF -- "${NEO4J_CONN_ID}"; then
  echo "🔌 Creating Neo4j connection..."
  airflow connections add "${NEO4J_CONN_ID}" \
    --conn-type neo4j \
    --conn-host "${NEO4J_URI}" \
    --conn-login "${NEO4J_USER}" \
    --conn-password "${NEO4J_PASSWORD}" \
    --conn-port "${NEO4J_PORT}"
else
  echo "✅ Neo4j connection already exists."
fi

# Set Airflow Variables from .env
echo "🧩 Setting Airflow Variables..."

#######################################
# Set an Airflow Variable if it does not exist yet.
# Arguments: $1 - variable key, $2 - variable value
#######################################
set_variable() {
  local key=$1
  local value=$2
  if ! airflow variables get "$key" &>/dev/null; then
    airflow variables set "$key" "$value"
    echo "  ➕ $key = $value"
  else
    echo "  ✅ $key already exists"
  fi
}

set_variable "CONN_ID" "${NEO4J_CONN_ID}"

echo "🎉 Initialization complete! Starting webserver..."
exec airflow webserver
|
||||
Reference in New Issue
Block a user