#!/bin/bash

# Environment for the distributed run, exported so the training processes
# started later in this script inherit it.
#   GPUS_PER_NODE       - not referenced again in this script; presumably read
#                         by the launcher or cluster tooling (TODO confirm).
#   NCCL_DEBUG          - NCCL log level.
#   NCCL_IB_DISABLE     - 0 leaves the InfiniBand transport enabled.
#   NCCL_SOCKET_IFNAME  - network interface NCCL uses for socket traffic.
#   NCCL_IB_HCA         - InfiniBand adapter selector; the value is
#                         cluster-specific (verify against `ibv_devices`).
export \
  GPUS_PER_NODE=8 \
  NCCL_DEBUG=INFO \
  NCCL_IB_DISABLE=0 \
  NCCL_SOCKET_IFNAME=eth0 \
  NCCL_IB_HCA=ib7s

# export NCCL_DEBUG_SUBSYS=ALL
# export NCCL_DEBUG_FILE=nccl-log.%h.%p

# Launch a multi-node LoRA SFT run of Qwen3-1.7B through llamafactory-cli.
#
# Optional environment overrides (defaults reproduce the original hard-coded
# values, so running the script unchanged still starts rank 0 of a 2-node job):
#   NNODES       total number of nodes                (default: 2)
#   NODE_RANK    rank of this node; must differ on    (default: 0)
#                every node of the job
#   MASTER_ADDR  address of the rank-0 node           (default: 172.19.116.123)
#   MASTER_PORT  rendezvous port on the rank-0 node   (default: 29500)
#   SWANLAB_API_KEY  SwanLab credential, read from the environment instead of
#                    being written into this file.
#
# Example for the second node:  NODE_RANK=1 ./this_script.sh
nnodes="${NNODES:-2}"
node_rank="${NODE_RANK:-0}"
master_addr="${MASTER_ADDR:-172.19.116.123}"
master_port="${MASTER_PORT:-29500}"

# The SwanLab API key used to be a literal `--swanlab_api_key` argument. That
# puts the secret into version control and into process listings, and its
# unquoted `*` characters were open to pathname expansion. It is now taken
# from the environment only. SwanLab documents SWANLAB_API_KEY as its
# credential variable — confirm for the installed version.
if [[ -n "${SWANLAB_API_KEY:-}" ]]; then
  export SWANLAB_API_KEY
else
  printf '%s\n' \
    "warning: SWANLAB_API_KEY is not set; SwanLab cloud logging will need previously stored login credentials." >&2
fi

# Arguments are kept in an array so they can be grouped and commented without
# fragile backslash continuations.
train_args=(
  # Task and model
  --stage sft
  --do_train True
  --model_name_or_path /shared-only/models/Qwen/Qwen3-1.7B
  --preprocessing_num_workers 16
  --finetuning_type lora
  --template qwen3
  --flash_attn auto

  # Data
  --dataset_dir /workspace/llamafactory/data
  --dataset identity
  --cutoff_len 1024

  # Optimisation
  --learning_rate 5e-05
  --num_train_epochs 3.0
  --max_samples 100000
  --per_device_train_batch_size 2
  --gradient_accumulation_steps 8
  --lr_scheduler_type cosine
  --max_grad_norm 1.0

  # Logging, checkpointing and output
  --logging_steps 5
  --save_steps 100
  --packing False
  --enable_thinking False
  --report_to none
  --output_dir /workspace/user-data/models/output/Qwen3-1.7B/lora/train_2026-02-02-19-30-20
  --bf16 True
  --plot_loss True
  --trust_remote_code True
  --ddp_timeout 180000000
  --include_num_input_tokens_seen True
  --warmup_ratio 0.1

  # LoRA adapter
  --lora_rank 8
  --lora_alpha 16
  --lora_dropout 0
  --lora_target all

  # SwanLab experiment tracking (credential comes from SWANLAB_API_KEY)
  --use_swanlab True
  --swanlab_project llamafactory
  --swanlab_mode cloud
)

# The distributed-launch settings are passed as per-command environment, as in
# the original invocation. This stays the last command so that the script's
# exit status is the trainer's exit status.
FORCE_TORCHRUN=1 \
NNODES="${nnodes}" \
NODE_RANK="${node_rank}" \
MASTER_ADDR="${master_addr}" \
MASTER_PORT="${master_port}" \
llamafactory-cli train "${train_args[@]}"
