добавляет dag
This commit is contained in:
78
dags/dag_01.py
Normal file
78
dags/dag_01.py
Normal file
@@ -0,0 +1,78 @@
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.models import Variable
|
||||
from airflow.operators.empty import EmptyOperator
|
||||
from airflow.operators.python import PythonOperator
|
||||
from task_01.composites import UserActionTransferTask
|
||||
from task_01.target_db.utills import get_driver
|
||||
|
||||
# Default arguments handed to the DAG below via ``default_args``.
default_args = dict(
    owner='data_engineer',
    depends_on_past=False,
    start_date=datetime(2025, 11, 1),
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(minutes=5),
    execution_timeout=timedelta(minutes=30),
)
|
||||
|
||||
# DAG definition


def _run_step(step_name: str):
    """Run one ``UserActionTransfer`` step with a driver built at execution time.

    The Airflow Variable lookup and the driver construction happen here,
    inside the task, so that merely parsing this DAG file does not read the
    ``CONN_ID`` Variable or create a driver, and so the driver is always
    closed once the step has finished.

    Args:
        step_name: Name of the method to call on the object returned by
            ``UserActionTransferTask(driver).run``.

    Returns:
        Whatever the selected step returns.
    """
    driver = get_driver(Variable.get("CONN_ID"))
    try:
        step = getattr(UserActionTransferTask(driver).run, step_name)
        return step()
    finally:
        driver.close()


with DAG(
    'neo4j_health_check_dag',
    default_args=default_args,
    schedule_interval='0 9 * * *',
    catchup=False,
    tags=['neo4j', 'healthcheck', 'data_loading', 'vers.01'],
    description='DAG для проверки здоровья Neo4j и загрузки тестовых данных',
) as dag:
    # Operators created inside the ``with DAG`` block are attached to ``dag``
    # by the context manager, so no explicit ``dag=`` argument is passed.

    # Initial task
    start_task = EmptyOperator(task_id='start_task')

    # Neo4j connection check
    check_neo4j_connection_task = PythonOperator(
        task_id='check_neo4j_connection',
        python_callable=_run_step,
        op_kwargs={'step_name': 'check_neo4j_connection'},
        retries=3,
        retry_delay=timedelta(minutes=1),
    )

    # Sample data generation
    generate_sample_data_task = PythonOperator(
        task_id='generate_sample_data',
        python_callable=_run_step,
        op_kwargs={'step_name': 'generate_sample_data'},
        retries=3,
        retry_delay=timedelta(minutes=1),
    )

    # Loading the data into Neo4j
    load_data_to_neo4j_task = PythonOperator(
        task_id='load_data_to_neo4j',
        python_callable=_run_step,
        op_kwargs={'step_name': 'load_data_to_neo4j'},
        retries=3,
        retry_delay=timedelta(minutes=1),
    )

    # Final task
    end_task = EmptyOperator(task_id='end_task')

    # Task dependencies: a single linear chain
    (
        start_task
        >> check_neo4j_connection_task
        >> generate_sample_data_task
        >> load_data_to_neo4j_task
        >> end_task
    )
|
||||
4
dags/requirements.txt
Normal file
4
dags/requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
apache-airflow~=2.7.1
|
||||
apache-airflow-providers-neo4j
|
||||
pandas
|
||||
Flask-Session<0.6
|
||||
0
dags/task_01/__init__.py
Normal file
0
dags/task_01/__init__.py
Normal file
22
dags/task_01/composites.py
Normal file
22
dags/task_01/composites.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from task_01.target_db.repositories import TargetDBRepo
|
||||
from task_01.tasks import UserActionTransfer
|
||||
|
||||
|
||||
class Adapters:
    """Builds the adapters used by the task from a single driver object."""

    def __init__(self, driver):
        """Remember the driver that the adapters will be built around."""
        self.driver = driver

    @property
    def target_repo(self):
        """Return a newly constructed ``TargetDBRepo`` for ``self.driver``.

        A new repository object is created on every access.
        """
        repo = TargetDBRepo(self.driver)
        return repo
|
||||
|
||||
|
||||
class UserActionTransferTask:
    """Composition root that wires ``UserActionTransfer`` to its adapters."""

    def __init__(self, driver):
        """Create the adapter container for the given driver."""
        self.adapters = Adapters(driver)

    @property
    def run(self):
        """Return a new ``UserActionTransfer`` backed by the target repository.

        Each access builds a fresh object.
        """
        target_db = self.adapters.target_repo
        return UserActionTransfer(target_db=target_db)
|
||||
8
dags/task_01/dto.py
Normal file
8
dags/task_01/dto.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class ConnectionInfo:
    """Plain value object holding the settings needed to connect to a database."""

    # Connection URI.
    uri: str
    # Login name used for authentication.
    username: str
    # Password used for authentication.
    password: str
|
||||
0
dags/task_01/target_db/__init__.py
Normal file
0
dags/task_01/target_db/__init__.py
Normal file
41
dags/task_01/target_db/repositories.py
Normal file
41
dags/task_01/target_db/repositories.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import logging
|
||||
|
||||
from neo4j import Driver
|
||||
from pandas import DataFrame
|
||||
|
||||
# Number of rows sent to the database per query in ``save_users``.
BATCH_SIZE = 1000


class TargetDBRepo:
    """Repository for reading from and writing to the target Neo4j database."""

    def __init__(self, driver: Driver):
        """Keep the driver used to open sessions.

        Args:
            driver: Neo4j driver. This class opens sessions on it but never
                closes the driver itself.
        """
        self.driver = driver
        self.log = logging.getLogger(__name__)

    def save_users(self, users: DataFrame) -> None:
        """Create one ``User`` node per row of *users*, in batches.

        Rows are sent ``BATCH_SIZE`` at a time as the ``$rows`` query
        parameter. The query reads ``user_id``, ``action`` and ``timestamp``
        from every row.

        Args:
            users: Rows to store.
        """
        query = """
            UNWIND $rows AS row
            CREATE (u:User {
                user_id: row.user_id,
                action: row.action,
                timestamp: row.timestamp
            })
        """

        with self.driver.session() as session:
            for start in range(0, len(users), BATCH_SIZE):
                batch = users.iloc[start:start + BATCH_SIZE]
                records = batch.to_dict(orient="records")
                # Consume the result right away so that a failure of this
                # batch is raised here rather than being left pending on the
                # session.
                session.run(query, {"rows": records}).consume()
                # Report the real running total; the last batch may be
                # smaller than BATCH_SIZE.
                self.log.info("rows saved %s", start + len(records))

    def get_number_of_users(self) -> int:
        """Return the number of ``User`` nodes in the database."""
        with self.driver.session() as session:
            result = session.run(
                "MATCH (u:User) RETURN count(u) as user_count")
            return result.single()["user_count"]

    def check_connection(self):
        """Run a trivial query and return the message it yields."""
        with self.driver.session() as session:
            result = session.run('RETURN "Connection successful" AS message')
            return result.single()["message"]
|
||||
21
dags/task_01/target_db/utills.py
Normal file
21
dags/task_01/target_db/utills.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from airflow.hooks.base import BaseHook
|
||||
from neo4j import Driver, GraphDatabase
|
||||
from task_01.dto import ConnectionInfo
|
||||
|
||||
|
||||
def get_neo4j_connection(conn_id: str) -> ConnectionInfo:
    """Build Neo4j connection settings from an Airflow connection.

    Args:
        conn_id: Identifier of the Airflow connection to look up.

    Returns:
        A ``ConnectionInfo`` with a ``bolt://`` URI plus the connection's
        login and password.
    """
    conn = BaseHook.get_connection(conn_id)
    # A connection saved without a port would otherwise yield the invalid
    # address "host:None"; leave the port out of the URI in that case.
    # NOTE(review): this relies on the driver applying its own default port.
    address = conn.host if conn.port is None else f'{conn.host}:{conn.port}'
    return ConnectionInfo(
        uri=f'bolt://{address}',
        username=conn.login,
        password=conn.password,
    )
|
||||
|
||||
|
||||
def get_driver(conn_id: str) -> Driver:
    """Create a Neo4j driver for the Airflow connection named *conn_id*.

    The URI and credentials come from ``get_neo4j_connection``.
    """
    info = get_neo4j_connection(conn_id)
    credentials = (info.username, info.password)
    return GraphDatabase.driver(uri=info.uri, auth=credentials)
|
||||
84
dags/task_01/tasks.py
Normal file
84
dags/task_01/tasks.py
Normal file
@@ -0,0 +1,84 @@
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from random import randint
|
||||
|
||||
import pandas as pd
|
||||
from airflow import AirflowException
|
||||
from airflow.models import Variable
|
||||
from task_01.target_db.repositories import TargetDBRepo
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class UserActionTransfer:
    """Steps that generate fake user actions and load them into the target DB."""

    def __init__(
        self,
        target_db: TargetDBRepo,
    ):
        """Store the repository used by the database-facing steps.

        Args:
            target_db: Repository providing ``save_users``,
                ``get_number_of_users`` and ``check_connection``.
        """
        self.target_db = target_db
        self.log = logging.getLogger(__name__)

    def generate_sample_data(self):
        """Generate sample data and save it to a CSV file."""
        actions = self.get_fake_user_action_data()
        csv_file_path = self.get_csv_file_path()
        actions.to_csv(csv_file_path, index=False)

        # Publish the file path in an Airflow Variable; load_data_to_neo4j
        # reads it back from there.
        Variable.set("user_action_data_path", csv_file_path)

        self.log.info("Sample data generated and saved to: %s", csv_file_path)
        self.log.info("Data preview: %s", actions.head())

    def load_data_to_neo4j(self) -> None:
        """Load the generated CSV into the target DB and delete the file.

        Raises:
            AirflowException: If the path Variable is unset or the file it
                points to does not exist.
        """
        # default_var=None makes a missing Variable reach the check below
        # instead of raising KeyError from Variable.get.
        csv_file_path = Variable.get("user_action_data_path", default_var=None)

        if not csv_file_path or not os.path.exists(csv_file_path):
            # Exceptions do not apply %-style arguments, so format explicitly.
            raise AirflowException(f"CSV file not found: {csv_file_path}")

        # Read the CSV file
        user_actions = pd.read_csv(csv_file_path)
        self.log.info("Loaded CSV data with %s rows", len(user_actions))

        self.target_db.save_users(user_actions)
        total_rows = self.target_db.get_number_of_users()
        self.log.info("Total rows: %s", total_rows)

        # Clean up the temporary file; this is only reached after a
        # successful save, so the file is still there if the save fails.
        os.remove(csv_file_path)
        self.log.info("Temporary CSV file cleaned up")

    def check_neo4j_connection(self):
        """Check the database connection.

        Returns:
            The message returned by ``target_db.check_connection()``.

        Raises:
            Exception: Whatever the repository raised, after logging it.
        """
        try:
            result = self.target_db.check_connection()
        except Exception as e:
            self.log.exception("Neo4j connection failed: %s", e)
            raise
        self.log.info("Neo4j message: %s", result)
        self.log.info("Neo4j connection is healthy")
        return result

    @staticmethod
    def get_csv_file_path() -> str:
        """Return the path of the CSV file inside the system temp directory."""
        temp_dir = tempfile.gettempdir()
        return os.path.join(temp_dir, "user_action_data.csv")

    @staticmethod
    def get_fake_user_action_data(size: int = 10000) -> pd.DataFrame:
        """Build a frame of random user actions.

        Args:
            size: Number of rows to generate; user ids run from 1 to *size*.

        Returns:
            A frame with the columns ``user_id``, ``action`` and
            ``timestamp``, where ``action`` is picked at random from a fixed
            list and ``timestamp`` is the current local time.
        """
        actions = ["login", "purchase", "view", "logout", "search"]
        ids = list(range(1, size + 1))
        sample_data = {
            "user_id": ids,
            "action": [actions[randint(0, len(actions) - 1)] for _ in ids],
            "timestamp": [datetime.now() for _ in ids],
        }
        return pd.DataFrame(sample_data)
|
||||
Reference in New Issue
Block a user