编辑
2026-04-01
大数据
00
bash
#!/bin/bash
# hadoop-cluster-setup.sh
# One-click Hadoop cluster setup script.
# Features: modular design, progress display, detailed logging, error
# recovery, configuration templates.
# Version: 1.1.0

# Strict error handling; -E lets the ERR trap fire inside functions too.
set -Eeuo pipefail
trap 'handle_error $? $LINENO "$BASH_COMMAND"' ERR
trap 'cleanup_on_exit' EXIT  # always print the summary on exit

# ==================== Configuration ====================
# Values here may be overridden by a configuration file.

# Cluster identity
CLUSTER_NAME="widdonexus-hadoop-cluster"
HADOOP_VERSION="3.3.6"
JAVA_VERSION="11"

# Cluster nodes (hostnames or IPs)
MASTER_NODE="hadoop102"
WORKER_NODES="hadoop103 hadoop104"
ALL_NODES="$MASTER_NODE $WORKER_NODES"

# Service placement:
#   hadoop102: NameNode + DataNode
#   hadoop103: DataNode + ResourceManager + NodeManager + JobHistory
#   hadoop104: DataNode + SecondaryNameNode + NodeManager
NAMENODE_NODE="hadoop102"
RESOURCEMANAGER_NODE="hadoop103"
SECONDARY_NODE="hadoop104"
JOBHISTORY_NODE="hadoop103"

# Dedicated user.
# WARNING: a plaintext password in a script is a security risk — prefer
# reading it from an environment variable or prompting at runtime.
HADOOP_USER="hadoop"
HADOOP_GROUP="hadoop"
HADOOP_PASSWORD="widdonexus@hadoop"

# Directories.
# NOTE(review): the Apache tarball extracts to "hadoop-<version>" (with a
# hyphen); confirm the install step really produces "hadoop<version>".
HADOOP_HOME="/opt/module/hadoop${HADOOP_VERSION}"
JAVA_HOME="/opt/module/java-$JAVA_VERSION-openjdk"
DATA_DIR="/data/hadoop"
LOG_DIR="/var/log/hadoop"
PID_DIR="/var/run/hadoop"

# Port plan ("port:description")
PORTS=(
  "9820:NameNode RPC"
  "9870:NameNode HTTP"
  "9866:DataNode RPC"
  "9864:DataNode HTTP"
  "9868:SecondaryNameNode HTTP"
  "8088:ResourceManager HTTP"
  "8042:NodeManager HTTP"
  "19888:JobHistory HTTP"
  "10020:JobHistory RPC"
)

# Download mirror (optional domestic mirror)
#HADOOP_MIRROR="https://dlcdn.apache.org/hadoop/common"
HADOOP_MIRROR="https://mirrors.bfsu.edu.cn/apache/hadoop/common"

# Terminal color codes
BLACK='\033[0;30m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
MAGENTA='\033[0;35m'
CYAN='\033[0;36m'
WHITE='\033[0;37m'
BOLD='\033[1m'
DIM='\033[2m'
ITALIC='\033[3m'
UNDERLINE='\033[4m'
BLINK='\033[5m'
REVERSE='\033[7m'
HIDDEN='\033[8m'
NC='\033[0m' # No Color

# Global state
LOG_FILE="/tmp/hadoop-setup-$(date +%Y%m%d-%H%M%S).log"
CONFIG_FILE="/tmp/hadoop-config-$(date +%s).conf"
STEP=0
TOTAL_STEPS=15
SUCCESS_COUNT=0
ERROR_COUNT=0
START_TIME=$(date +%s)

# ==================== Output helpers ====================
# Print the ASCII-art banner.
print_banner() {
  clear
  echo -e "${BLUE}${BOLD}"
  echo "╔══════════════════════════════════════════════════════════╗"
  echo "║ ║"
  echo "║ ██╗ ██╗ █████╗ ██████╗ ██████╗ ██████╗ ██████╗ ║"
  echo "║ ██║ ██║██╔══██╗██╔══██╗██╔═══██╗██╔═══██╗██╔══██╗ ║"
  echo "║ ███████║███████║██║ ██║██║ ██║██║ ██║██████╔╝ ║"
  echo "║ ██╔══██║██╔══██║██║ ██║██║ ██║██║ ██║██╔═══╝ ║"
  echo "║ ██║ ██║██║ ██║██████╔╝╚██████╔╝╚██████╔╝██║ ║"
  echo "║ ╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═════╝ ╚═════╝ ╚═╝ ║"
  echo "║ ║"
  echo "║ v1.1.0 ║"
  echo "║ widdonexus ║"
  echo "╚══════════════════════════════════════════════════════════╝"
  echo -e "${NC}"
}

# Show the main menu and set the global MODE accordingly.
show_main_menu() {
  print_banner
  echo -e "${YELLOW}${BOLD}请选择操作模式:${NC}"
  echo -e " ${GREEN}[1]${NC} ${BOLD}安装集群${NC} - 全新安装或重新安装Hadoop集群"
  echo -e " ${RED}[2]${NC} ${BOLD}卸载集群${NC} - 完全卸载现有Hadoop集群"
  echo -e " ${CYAN}[3]${NC} ${BOLD}检查状态${NC} - 检查集群运行状态"
  echo -e " ${GREEN}[4]${NC} ${BOLD}启动集群${NC} - 启动Hadoop所有服务"
  echo -e " ${RED}[5]${NC} ${BOLD}停止集群${NC} - 停止Hadoop所有服务"
  echo -e " ${YELLOW}[6]${NC} ${BOLD}修复集群${NC} - 修复Java环境和数据目录问题"
  echo -e " ${BLUE}[7]${NC} ${BOLD}退出脚本${NC}"
  echo -e "\n${YELLOW}请输入选择 (1-7): ${NC}\c"
  read -r main_choice
  case $main_choice in
    1) MODE="install" ;;
    2) MODE="uninstall" ;;
    3) MODE="status" ;;
    4) MODE="start" ;;
    5) MODE="stop" ;;
    6) MODE="fix" ;;
    7)
      echo -e "${GREEN}退出脚本。${NC}"
      exit 0
      ;;
    *)
      print_error "无效选择,默认使用安装模式"
      MODE="install"
      ;;
  esac
}

# Check node reachability and HDFS/YARN service status, then wait for a key.
check_cluster_status() {
  print_banner
  print_step "集群状态检查" "检查Hadoop集群运行状态"
  echo -e "\n${CYAN}${BOLD}📊 集群基本信息:${NC}"
  echo -e " 集群名称: ${YELLOW}$CLUSTER_NAME${NC}"
  echo -e " Hadoop版本: ${YELLOW}$HADOOP_VERSION${NC}"
  echo -e " 运行用户: ${YELLOW}$HADOOP_USER${NC}"
  echo -e "\n${CYAN}${BOLD}🔍 节点状态:${NC}"
  local alive_nodes=0
  local total_nodes=0
  for node in $ALL_NODES; do
    # ((var++)) returns status 1 when var is 0 and would abort under
    # set -e, so use plain arithmetic assignment instead.
    total_nodes=$((total_nodes + 1))
    if ping -c 1 -W 1 "$node" &> /dev/null; then
      alive_nodes=$((alive_nodes + 1))
      echo -e " ${GREEN}${NC} $node - 在线"
    else
      echo -e " ${RED}${NC} $node - 离线"
    fi
  done
  echo -e "\n${CYAN}${BOLD}🔄 服务状态:${NC}"
  # HDFS status via dfsadmin report
  if command -v hdfs &> /dev/null; then
    if hdfs dfsadmin -report 2>/dev/null | grep -q "Live datanodes"; then
      # NOTE(review): field $3 of "Live datanodes (N):" includes parentheses
      # — confirm the intended extraction.
      local datanodes=$(hdfs dfsadmin -report 2>&1 | grep "Live datanodes" | awk '{print $3}')
      echo -e " ${GREEN}${NC} HDFS - 运行中 (DataNodes: $datanodes)"
    else
      echo -e " ${RED}${NC} HDFS - 未运行"
    fi
  else
    echo -e " ${YELLOW}${NC} HDFS - 命令不可用"
  fi
  # YARN status via node list
  if command -v yarn &> /dev/null; then
    if yarn node -list 2>/dev/null | grep -q "Total Nodes"; then
      local yarn_nodes=$(yarn node -list 2>&1 | grep "Total Nodes" | awk '{print $3}')
      echo -e " ${GREEN}${NC} YARN - 运行中 (Nodes: $yarn_nodes)"
    else
      echo -e " ${RED}${NC} YARN - 未运行"
    fi
  else
    echo -e " ${YELLOW}${NC} YARN - 命令不可用"
  fi
  echo -e "\n${CYAN}${BOLD}🌐 Web UI访问:${NC}"
  echo -e " NameNode: http://${NAMENODE_NODE}:9870"
  echo -e " ResourceManager: http://${RESOURCEMANAGER_NODE}:8088"
  echo -e "\n${GREEN}${BOLD}══════════════════════════════════════════════════════════${NC}"
  echo -e " 在线节点: ${alive_nodes}/${total_nodes}"
  echo -e " 检查时间: $(date '+%Y-%m-%d %H:%M:%S')"
  echo -e "${GREEN}${BOLD}══════════════════════════════════════════════════════════${NC}"
  echo -e "\n${YELLOW}按任意键返回主菜单...${NC}"
  read -n1 -s
}

# Announce a numbered step; increments the global STEP counter.
print_step() {
  # ((STEP++)) returns 1 when STEP is 0 and trips set -e; use assignment.
  STEP=$((STEP + 1))
  local step_msg="$1"
  local description="$2"
  echo -e "\n${CYAN}${BOLD}[步骤 $STEP/$TOTAL_STEPS] ${NC}${step_msg}"
  echo -e "${DIM}${description}${NC}"
  echo -e "${BLUE}┌─────────────────────────────────────────────────────${NC}"
}

# Close the visual frame opened by print_step.
print_step_complete() {
  echo -e "${BLUE}└─────────────────────────────────────────────────────${NC}"
}

# Report success; increments SUCCESS_COUNT and appends to the log file.
print_success() {
  local message="$1"
  echo -e " ${GREEN}${NC} ${message}"
  SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
  echo "$(date +'%Y-%m-%d %H:%M:%S') [SUCCESS] $message" >> "$LOG_FILE"
}

# Report a warning; logged but not counted as an error.
print_warning() {
  local message="$1"
  echo -e " ${YELLOW}${NC} ${message}"
  echo "$(date +'%Y-%m-%d %H:%M:%S') [WARNING] $message" >> "$LOG_FILE"
}

# Report an error; increments ERROR_COUNT and appends to the log file.
print_error() {
  local message="$1"
  echo -e " ${RED}${NC} ${message}"
  ERROR_COUNT=$((ERROR_COUNT + 1))
  echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] $message" >> "$LOG_FILE"
}
# Informational message; echoed and appended to the log file.
print_info() {
  local message="$1"
  echo -e " ${BLUE}${NC} ${message}"
  echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] $message" >> "$LOG_FILE"
}

# Draw a 50-column progress bar for current/total.
print_progress() {
  local current="$1"
  local total="$2"
  local width=50
  local percentage=$((current * 100 / total))
  local completed=$((width * current / total))
  local remaining=$((width - completed))
  printf "\r${BLUE}["
  printf "%${completed}s" | tr ' ' '='
  printf "%${remaining}s" | tr ' ' ' '
  printf "] ${percentage}%%${NC}"
}

# Spinner while process $1 is alive.
spinner() {
  local pid=$1
  local delay=0.1
  local spinstr='|/-\'
  # Poll with kill -0 instead of parsing `ps` output: robust against
  # PID-substring matches and avoids an unquoted $pid in grep.
  while kill -0 "$pid" 2>/dev/null; do
    local temp=${spinstr#?}
    printf " [%c] " "$spinstr"
    spinstr=$temp${spinstr%"$temp"}
    sleep $delay
    printf "\b\b\b\b\b\b"
  done
  printf " \b\b\b\b"
}

# ==================== Error handling ====================

# ERR-trap handler: report context, dump log tail, clean up and exit.
handle_error() {
  local exit_code=$1
  local line_no=$2
  local command=$3
  # If the failure happened inside cleanup itself, bail out immediately to
  # avoid recursing through the EXIT trap.
  if [[ "$command" == *cleanup_on_exit* ]]; then
    echo -e "${RED}清理过程中出错,强制退出${NC}"
    exit $exit_code
  fi
  echo -e "\n${RED}${BOLD}══════════════════════════════════════════════════════════${NC}"
  echo -e "${RED}${BOLD} 错误发生!${NC}"
  echo -e "${RED}退出码: $exit_code${NC}"
  echo -e "${RED}行号: $line_no${NC}"
  echo -e "${RED}命令: $command${NC}"
  echo -e "${RED}详细日志请查看: $LOG_FILE${NC}"
  echo -e "${RED}${BOLD}══════════════════════════════════════════════════════════${NC}"
  echo -e "\n${YELLOW}日志最后10行:${NC}"
  tail -10 "$LOG_FILE" 2>/dev/null || echo "无法读取日志文件"
  # Exit directly without asking the user to continue.
  cleanup_on_exit
  exit $exit_code
}

# Run a module with the ERR trap suspended so one module's failure does not
# abort the whole script; the trap is restored afterwards.
safe_execute() {
  local module_name="$1"
  local module_function="$2"
  trap '' ERR
  if $module_function; then
    trap 'handle_error $? $LINENO "$BASH_COMMAND"' ERR
    return 0
  else
    local exit_code=$?
    echo -e "${RED}✗ 模块 $module_name 执行失败 (退出码: $exit_code)${NC}"
    trap 'handle_error $? $LINENO "$BASH_COMMAND"' ERR
    return $exit_code
  fi
}

# EXIT-trap handler: print an execution summary.
cleanup_on_exit() {
  local end_time=$(date +%s)
  local duration=$((end_time - START_TIME))
  echo -e "\n${BLUE}${BOLD}══════════════════════════════════════════════════════════${NC}"
  echo -e "${GREEN}${BOLD} 执行摘要${NC}"
  echo -e "${BLUE}${BOLD}══════════════════════════════════════════════════════════${NC}"
  echo -e " 总步骤: $TOTAL_STEPS"
  echo -e " 成功: ${GREEN}$SUCCESS_COUNT${NC}"
  echo -e " 错误: ${RED}$ERROR_COUNT${NC}"
  echo -e " 耗时: ${YELLOW}${duration}${NC}"
  echo -e " 日志文件: $LOG_FILE"
  echo -e " 配置备份: $CONFIG_FILE"
  echo -e "${BLUE}${BOLD}══════════════════════════════════════════════════════════${NC}"
}

# Verify that every required configuration parameter is non-empty.
validate_config() {
  local missing_params=()
  [ -z "$MASTER_NODE" ] && missing_params+=("MASTER_NODE")
  [ -z "$WORKER_NODES" ] && missing_params+=("WORKER_NODES")
  [ -z "$HADOOP_USER" ] && missing_params+=("HADOOP_USER")
  [ -z "$HADOOP_HOME" ] && missing_params+=("HADOOP_HOME")
  [ -z "$HADOOP_VERSION" ] && missing_params+=("HADOOP_VERSION")
  if [ ${#missing_params[@]} -gt 0 ]; then
    print_error "缺少必需的配置参数: ${missing_params[*]}"
    exit 1
  fi
}

# ==================== Uninstall modules ====================

# Module A: confirm the uninstall and select UNINSTALL_LEVEL.
# Returns 1 when the user cancels.
module_uninstall_confirm() {
  print_step "卸载确认" "确认卸载Hadoop集群操作"
  echo -e "${RED}${BOLD}══════════════════════════════════════════════════════════${NC}"
  echo -e "${RED}${BOLD} ⚠️ 警告:危险操作! ${NC}"
  echo -e "${RED}${BOLD}══════════════════════════════════════════════════════════${NC}"
  echo -e "\n${YELLOW}${BOLD}此操作将永久删除以下内容:${NC}"
  echo -e " ${RED}• Hadoop安装目录: $HADOOP_HOME${NC}"
  echo -e " ${RED}• 所有数据目录: $DATA_DIR${NC}"
  echo -e " ${RED}• 日志和PID目录: $LOG_DIR, $PID_DIR${NC}"
  echo -e " ${RED}• 所有配置文件和环境变量${NC}"
  echo -e " ${RED}• 集群所有节点上的Hadoop相关文件${NC}"
  echo -e "\n${YELLOW}${BOLD}影响范围:${NC}"
  for node in $ALL_NODES; do
    echo -e " ${CYAN}• $node${NC}"
  done
  echo -e "\n${YELLOW}${BOLD}请选择操作:${NC}"
  echo -e " 1) ${GREEN}取消卸载${NC} - 返回主菜单"
  echo -e " 2) ${YELLOW}普通卸载${NC} - 删除Hadoop文件,保留用户和目录"
  echo -e " 3) ${RED}完全卸载${NC} - 删除所有相关文件,包括用户"
  echo -e "\n${YELLOW}请输入选择 (1/2/3): ${NC}\c"
  read -r uninstall_choice
  case $uninstall_choice in
    1)
      print_info "卸载操作已取消"
      return 1
      ;;
    2)
      UNINSTALL_LEVEL="normal"
      print_info "选择普通卸载模式"
      ;;
    3)
      UNINSTALL_LEVEL="full"
      print_info "选择完全卸载模式"
      ;;
    *)
      print_error "无效选择,默认使用普通卸载"
      UNINSTALL_LEVEL="normal"
      ;;
  esac
  # Default to true: CONFIRM_BEFORE_REMOVE may be unset, which would abort
  # under `set -u` (get_uninstall_confirmation already defaults it this way).
  if [ "${CONFIRM_BEFORE_REMOVE:-true}" = true ]; then
    echo -e "\n${RED}${BOLD}最后一次确认!${NC}"
    echo -e "${RED}请输入 'YES, DELETE HADOOP' 以确认卸载:${NC}\c"
    read -r final_confirm
    if [ "$final_confirm" != "YES, DELETE HADOOP" ]; then
      print_info "卸载操作已取消"
      return 1
    fi
  fi
  print_success "确认完成,开始卸载流程"
  print_step_complete
  return 0
}
# Module B: stop every Hadoop service on all nodes (JobHistory, YARN, HDFS,
# then per-node daemons, finally SIGKILL stragglers).
module_stop_services() {
  print_step "停止服务" "停止所有Hadoop集群服务"

  # Helper executed through safe_execute so partial failures do not abort.
  stop_hadoop_services() {
    print_info "停止当前节点的Hadoop服务..."

    # Stop the JobHistory server on its dedicated node.
    if pgrep -f "historyserver" > /dev/null; then
      sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$JOBHISTORY_NODE" bash -c '
        if [ -f ~/.bashrc ]; then
          source ~/.bashrc
        fi
        if [ -f /etc/profile.d/hadoop.sh ]; then
          source /etc/profile.d/hadoop.sh
        fi
        "$HADOOP_HOME/bin/mapred" --daemon stop historyserver
      ' 2>/dev/null || true
      print_info "停止JobHistory Server"
    fi

    # Stop YARN from the ResourceManager node.
    if [ -f "$HADOOP_HOME/sbin/stop-yarn.sh" ]; then
      sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$RESOURCEMANAGER_NODE" bash -c '
        if [ -f ~/.bashrc ]; then
          source ~/.bashrc
        fi
        if [ -f /etc/profile.d/hadoop.sh ]; then
          source /etc/profile.d/hadoop.sh
        fi
        "$HADOOP_HOME/sbin/stop-yarn.sh"
      ' 2>/dev/null || true
      print_info "停止YARN服务"
    fi

    # Stop HDFS from the NameNode node.
    if [ -f "$HADOOP_HOME/sbin/stop-dfs.sh" ]; then
      sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$NAMENODE_NODE" bash -c '
        if [ -f ~/.bashrc ]; then
          source ~/.bashrc
        fi
        if [ -f /etc/profile.d/hadoop.sh ]; then
          source /etc/profile.d/hadoop.sh
        fi
        "$HADOOP_HOME/sbin/stop-dfs.sh"
      ' 2>/dev/null || true
      print_info "停止HDFS服务"
    fi

    # Stop remaining daemons on every other node.
    for node in $ALL_NODES; do
      if [ "$node" != "$(hostname)" ]; then
        print_info "停止节点 $node 的服务..."
        sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" bash -c '
          $HADOOP_HOME/bin/hdfs --daemon stop secondarynamenode 2>/dev/null || true
          $HADOOP_HOME/bin/hdfs --daemon stop datanode 2>/dev/null || true
          $HADOOP_HOME/bin/yarn --daemon stop nodemanager 2>/dev/null || true
          pkill -u $HADOOP_USER -f hadoop 2>/dev/null || true
          pkill -u $HADOOP_USER -f yarn 2>/dev/null || true
          pkill -u $HADOOP_USER -f hdfs 2>/dev/null || true
        ' 2>/dev/null || print_warning "节点 $node 服务停止时出现警告"
      fi
    done

    # Give daemons time to shut down, then SIGKILL whatever is left.
    sleep 3
    for node in $ALL_NODES; do
      sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" "pkill -9 -u $HADOOP_USER -f 'hadoop|yarn|hdfs'" 2>/dev/null || true
    done
  }

  if safe_execute "停止Hadoop服务" stop_hadoop_services; then
    print_success "所有Hadoop服务已停止"
  else
    print_warning "部分服务可能未完全停止"
  fi
  print_step_complete
}

# Module C: delete Hadoop files and directories locally and on every node;
# optionally back up the configuration first.
module_remove_files() {
  print_step "删除文件" "删除Hadoop相关文件和目录"

  # Default when the caller did not set it (safe under set -u).
  local BACKUP_BEFORE_REMOVE=${BACKUP_BEFORE_REMOVE:-false}

  # Optional configuration backup before destruction.
  if [ "$BACKUP_BEFORE_REMOVE" = true ]; then
    local backup_dir="/tmp/hadoop-backup-$(date +%Y%m%d-%H%M%S)"
    sudo mkdir -p "$backup_dir"
    print_info "创建配置文件备份: $backup_dir"
    sudo cp -r "$HADOOP_HOME/etc/hadoop" "$backup_dir/config" 2>/dev/null || true
    sudo cp -r "/etc/profile.d/hadoop.sh" "$backup_dir/" 2>/dev/null || true
  fi

  print_info "删除本地文件..."
  local items_to_remove=(
    "$HADOOP_HOME"
    "$DATA_DIR"
    "$LOG_DIR"
    "$PID_DIR"
    "/tmp/hadoop-*"
    "/tmp/hsperfdata_$HADOOP_USER"
    "/tmp/jetty_*"
  )
  for item in "${items_to_remove[@]}"; do
    # [[ ]] is required for the glob-pattern check; plain [ would perform
    # pathname expansion on *"*"* instead of matching literally.
    if [ -e "$item" ] || [[ "$item" == *"*"* ]]; then
      # $item intentionally unquoted so wildcard entries expand.
      sudo rm -rf $item 2>/dev/null || true
      print_info "删除: $item"
    fi
  done

  # Clean every cluster node.
  for node in $ALL_NODES; do
    print_info "清理节点: $node"
    sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" "
      set -e
      if [ -d '$HADOOP_HOME' ]; then
        sudo rm -rf '$HADOOP_HOME'
        echo '删除Hadoop安装目录'
      fi
      if [ -d '$DATA_DIR' ]; then
        sudo rm -rf '$DATA_DIR'
        echo '删除数据目录'
      fi
      if [ -d '$LOG_DIR' ]; then
        sudo rm -rf '$LOG_DIR'
        echo '删除日志目录'
      fi
      if [ -d '$PID_DIR' ]; then
        sudo rm -rf '$PID_DIR'
        echo '删除PID目录'
      fi
      sudo rm -rf /tmp/hadoop-* /tmp/hsperfdata_$HADOOP_USER /tmp/Jetty_* 2>/dev/null || true
      sudo rm -f /tmp/*.pid /tmp/*.out 2>/dev/null || true
      sudo rm -f /etc/profile.d/hadoop.sh 2>/dev/null || true
      sudo rm -f /etc/sudoers.d/hadoop-$HADOOP_USER 2>/dev/null || true
      sudo sed -i '/HADOOP_HOME/d' /home/$HADOOP_USER/.bashrc 2>/dev/null || true
      sudo sed -i '/JAVA_HOME/d' /home/$HADOOP_USER/.bashrc 2>/dev/null || true
      sudo sed -i '/HADOOP_CONF_DIR/d' /home/$HADOOP_USER/.bashrc 2>/dev/null || true
    " 2>/dev/null || print_warning "节点 $node 清理时出现警告"
  done

  # Remove local environment files as well.
  sudo rm -f /etc/profile.d/hadoop.sh 2>/dev/null || true
  sudo rm -f /etc/sudoers.d/hadoop-$HADOOP_USER 2>/dev/null || true
  print_success "文件和目录删除完成"
  print_step_complete
}
# Module D: remove the Hadoop user and group on every node.
# Only runs in "full" uninstall mode; keeps users whose home is non-empty.
module_clean_users() {
  if [ "$UNINSTALL_LEVEL" != "full" ]; then
    print_info "跳过用户清理(普通卸载模式)"
    return 0
  fi
  print_step "清理用户" "删除Hadoop用户和组"
  for node in $ALL_NODES; do
    print_info "清理节点 $node 的用户..."
    sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" "
      set -e
      if id '$HADOOP_USER' &> /dev/null; then
        if [ -d '/home/$HADOOP_USER' ]; then
          file_count=\$(sudo find /home/$HADOOP_USER -maxdepth 1 -type f 2>/dev/null | wc -l)
          dir_count=\$(sudo find /home/$HADOOP_USER -maxdepth 1 -type d 2>/dev/null | wc -l)
          if [ \$file_count -eq 0 ] && [ \$dir_count -le 1 ]; then
            sudo userdel -r '$HADOOP_USER' 2>/dev/null || true
            echo '删除用户: $HADOOP_USER'
            if getent group '$HADOOP_GROUP' &> /dev/null; then
              group_members=\$(getent group '$HADOOP_GROUP' | cut -d: -f4)
              if [ -z \"\$group_members\" ]; then
                sudo groupdel '$HADOOP_GROUP' 2>/dev/null || true
                echo '删除组: $HADOOP_GROUP'
              else
                echo '组 $HADOOP_GROUP 仍有其他成员,保留'
              fi
            fi
          else
            echo '用户主目录非空,保留用户'
            echo '您可以手动清理: sudo rm -rf /home/$HADOOP_USER'
          fi
        fi
      else
        echo '用户 $HADOOP_USER 不存在'
      fi
    " 2>/dev/null || print_warning "节点 $node 用户清理时出现警告"
  done
  print_success "用户和组清理完成"
  print_step_complete
}

# Module E: remove Hadoop port rules from whichever firewall each node runs.
module_clean_firewall() {
  print_step "清理防火墙" "删除Hadoop相关防火墙规则"
  # Ports that were opened during installation.
  local ports=("9820" "9870" "9866" "9864" "9868" "8088" "8042" "19888" "10020")
  for node in $ALL_NODES; do
    print_info "清理节点 $node 的防火墙规则..."
    sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" "
      set -e
      if command -v ufw &> /dev/null; then
        for port in ${ports[@]}; do
          sudo ufw delete allow \"\$port/tcp\" 2>/dev/null || true
        done
        sudo ufw reload 2>/dev/null || true
        echo 'UFW防火墙规则已清理'
      elif command -v firewall-cmd &> /dev/null; then
        for port in ${ports[@]}; do
          sudo firewall-cmd --permanent --remove-port=\"\$port/tcp\" 2>/dev/null || true
        done
        sudo firewall-cmd --reload 2>/dev/null || true
        echo 'Firewalld防火墙规则已清理'
      elif command -v iptables &> /dev/null; then
        echo '检测到iptables,防火墙规则需要手动清理'
        echo '相关端口: ${ports[@]}'
      else
        echo '未检测到防火墙工具'
      fi
    " 2>/dev/null || print_warning "节点 $node 防火墙清理时出现警告"
  done
  print_success "防火墙规则清理完成"
  print_step_complete
}

# Module F: verify the uninstall — directories gone and no Hadoop processes
# left on any node, plus the local directory check.
module_uninstall_verify() {
  print_step "验证卸载" "验证Hadoop集群已完全卸载"
  print_info "清理脚本相关进程..."
  # Kill any lingering ssh helpers spawned by this script.
  pkill -f "hadoop" 2>/dev/null || true

  local verification_passed=0
  local verification_total=0
  print_info "检查各节点卸载情况..."
  for node in $ALL_NODES; do
    # ((var++)) would return 1 at 0 and abort under set -e.
    verification_total=$((verification_total + 1))
    if sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" "[ ! -d '$HADOOP_HOME' ]" 2>/dev/null; then
      verification_passed=$((verification_passed + 1))
      print_success "节点 $node: Hadoop目录已删除 ✓"
    else
      print_warning "节点 $node: Hadoop目录可能还存在"
    fi
    local process_count=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" "pgrep -f 'hadoop|yarn|hdfs' 2>/dev/null | wc -l" 2>/dev/null || echo "0")
    if [ "$process_count" -eq 0 ]; then
      print_success "节点 $node: 无Hadoop进程运行 ✓"
    else
      print_warning "节点 $node: 仍有 $process_count 个Hadoop进程在运行"
    fi
  done

  # Local check counts as one extra verification.
  if [ ! -d "$HADOOP_HOME" ]; then
    verification_passed=$((verification_passed + 1))
    print_success "本地Hadoop目录已删除 ✓"
  else
    print_warning "本地Hadoop目录仍然存在"
  fi

  local BACKUP_BEFORE_REMOVE=${BACKUP_BEFORE_REMOVE:-false}
  if [ $verification_passed -eq $((verification_total + 1)) ]; then
    echo -e "\n${GREEN}${BOLD}══════════════════════════════════════════════════════════${NC}"
    echo -e "${GREEN}${BOLD} 🎉 Hadoop集群卸载完成! ${NC}"
    echo -e "${GREEN}${BOLD}══════════════════════════════════════════════════════════${NC}"
    # NOTE(review): backup_dir is local to module_remove_files and not
    # visible here; guarded with :- so set -u does not abort.
    if [ "$BACKUP_BEFORE_REMOVE" = true ] && [ -d "${backup_dir:-}" ]; then
      echo -e "${YELLOW}配置文件备份位于: $backup_dir${NC}"
      echo -e "${YELLOW}请在确认不再需要后手动删除备份${NC}"
    fi
    echo -e "\n${CYAN}${BOLD}📋 卸载完成总结:${NC}"
    echo -e " 删除级别: ${YELLOW}$UNINSTALL_LEVEL${NC}"
    echo -e " 清理节点数: ${YELLOW}${verification_total}${NC}"
    echo -e " 验证通过: ${GREEN}${verification_passed}/$((verification_total + 1))${NC}"
    if [ "$UNINSTALL_LEVEL" = "normal" ]; then
      echo -e "\n${BLUE}${BOLD}💡 注意:${NC}"
      echo -e " Hadoop用户和组仍保留,便于重新安装"
      echo -e " 如需完全清理,请选择'完全卸载'模式"
    fi
    echo -e "\n${GREEN}现在您可以重新运行安装脚本创建新的集群。${NC}"
  else
    print_warning "卸载基本完成,但建议手动检查以下项目:"
    echo -e " 1. 检查所有节点的 $HADOOP_HOME 目录"
    echo -e " 2. 检查所有节点的Hadoop相关进程"
    echo -e " 3. 检查环境变量配置"
    echo -e " 4. 检查防火墙规则"
  fi
  print_step_complete
}
# Print the destructive-operation warning banner and the affected nodes.
show_uninstall_warning() {
  echo -e "${RED}${BOLD}══════════════════════════════════════════════════════════${NC}"
  echo -e "${RED}${BOLD} ⚠️ 警告:危险操作! ${NC}"
  echo -e "${RED}${BOLD}══════════════════════════════════════════════════════════${NC}"
  echo -e "\n${YELLOW}${BOLD}此操作将永久删除以下内容:${NC}"
  echo -e " ${RED}• Hadoop安装目录: $HADOOP_HOME${NC}"
  echo -e " ${RED}• 所有数据目录: $DATA_DIR${NC}"
  echo -e " ${RED}• 日志和PID目录: $LOG_DIR, $PID_DIR${NC}"
  echo -e " ${RED}• 所有配置文件和环境变量${NC}"
  echo -e " ${RED}• 集群所有节点上的Hadoop相关文件${NC}"
  echo -e "\n${YELLOW}${BOLD}影响范围:${NC}"
  for node in $ALL_NODES; do
    echo -e " ${CYAN}• $node${NC}"
  done
}

# Ask for the uninstall level and the final phrase confirmation.
# Sets UNINSTALL_LEVEL; returns 1 when the user cancels.
get_uninstall_confirmation() {
  echo -e "\n${YELLOW}${BOLD}请选择操作:${NC}"
  echo -e " 1) ${GREEN}取消卸载${NC} - 返回主菜单"
  echo -e " 2) ${YELLOW}普通卸载${NC} - 删除Hadoop文件,保留用户和目录"
  echo -e " 3) ${RED}完全卸载${NC} - 删除所有相关文件,包括用户"
  echo -e "\n${YELLOW}请输入选择 (1/2/3): ${NC}\c"
  read -r uninstall_choice
  case $uninstall_choice in
    1)
      return 1
      ;;
    2)
      UNINSTALL_LEVEL="normal"
      echo -e "${YELLOW}选择普通卸载模式${NC}"
      ;;
    3)
      UNINSTALL_LEVEL="full"
      echo -e "${RED}选择完全卸载模式${NC}"
      ;;
    *)
      echo -e "${RED}无效选择,默认使用普通卸载${NC}"
      UNINSTALL_LEVEL="normal"
      ;;
  esac
  # Final phrase confirmation unless explicitly disabled.
  local confirm_before_remove=${CONFIRM_BEFORE_REMOVE:-true}
  if [ "$confirm_before_remove" = true ]; then
    echo -e "\n${RED}${BOLD}最后一次确认!${NC}"
    echo -e "${RED}请输入 'YES, DELETE HADOOP' 以确认卸载:${NC}\c"
    read -r final_confirm
    if [ "$final_confirm" != "YES, DELETE HADOOP" ]; then
      return 1
    fi
  fi
  return 0
}

# ==================== Uninstall driver ====================
# Run the full uninstall pipeline: warn, confirm, then execute the five
# uninstall modules inside a subshell whose output is tee'd to the log.
uninstall_hadoop_cluster() {
  echo -e "\n${BLUE}开始执行Hadoop集群卸载...${NC}"
  show_uninstall_warning
  if ! get_uninstall_confirmation; then
    echo -e "${GREEN}卸载已取消。${NC}"
    return 0
  fi
  echo -e "\n${DIM}详细日志将保存到: $LOG_FILE${NC}"
  echo -e "${BLUE}${BOLD}══════════════════════════════════════════════════════════${NC}"
  local start_time=$(date +%s)
  # Subshell so the step counters are reset only for the uninstall flow.
  (
    STEP=0
    TOTAL_STEPS=5
    module_stop_services
    module_remove_files
    module_clean_users
    module_clean_firewall
    module_uninstall_verify
  ) 2>&1 | tee -a "$LOG_FILE" || {
    echo "警告:日志记录可能不完整,但卸载过程继续..." >&2
  }
  local end_time=$(date +%s)
  local duration=$((end_time - start_time))
  echo -e "\n${GREEN}${BOLD}✅ 卸载完成!耗时: ${duration}${NC}"
}
# ==================== Core install modules ====================

# Module 1: detect the OS and install base dependencies (incl. sshpass).
module_system_prepare() {
  print_step "系统准备" "更新系统并安装基础依赖"
  if [ -f /etc/os-release ]; then
    . /etc/os-release
    OS=$ID
    OS_VERSION=$VERSION_ID
    print_info "检测到操作系统: $NAME $VERSION"
  else
    print_warning "无法检测操作系统,假设为Ubuntu/Debian"
    OS="ubuntu"
  fi
  case $OS in
    ubuntu|debian)
      sudo apt-get update && sudo apt-get upgrade -y
      sudo apt-get install -y ssh pdsh curl wget tar gnupg lsb-release \
        net-tools dnsutils tree htop iotop iftop vim jq python3 python3-pip \
        sshpass expect
      ;;
    centos|rhel|fedora)
      sudo yum update -y
      sudo yum install -y epel-release
      # NOTE(review): yum repos normally have no package named "ssh"
      # (it is openssh-clients/openssh-server) — confirm this list.
      sudo yum install -y ssh pdsh curl wget tar gnupg2 redhat-lsb-core \
        net-tools bind-utils tree htop iotop iftop vim jq python3 python3-pip \
        sshpass expect
      ;;
    *)
      print_warning "不支持的操作系统,跳过系统更新"
      ;;
  esac
  print_success "系统准备完成"
  print_step_complete
}

# Module 2: create the Hadoop user/group (fixed UID/GID) on every node and
# install a NOPASSWD sudoers entry; then verify UID/GID consistency.
module_user_setup() {
  print_step "用户设置" "在所有节点创建Hadoop专用用户和组"
  # Fixed IDs so file ownership matches across nodes.
  local hadoop_uid=1001
  local hadoop_gid=1001
  local hadoop_home="/home/$HADOOP_USER"
  local hadoop_shell="/bin/bash"
  local current_host=$(hostname)
  print_info "正在所有节点创建用户和组..."
  local count=0
  for node in $ALL_NODES; do
    print_info "配置节点: $node"
    # NOTE(review): inside the outer double quotes, unescaped $vars expand
    # locally; the quoted 'EOF' here-doc then suppresses remote expansion,
    # so lines like \$sudo_yum_path may land literally in the sudoers file
    # — verify the generated file on a node.
    ssh "$node" "bash -c '
set -e
# 创建用户组(固定GID)
if getent group \"$HADOOP_GROUP\" > /dev/null; then
  :
else
  sudo groupadd -g $hadoop_gid \"$HADOOP_GROUP\"
fi
# 创建用户(固定UID)
if id \"$HADOOP_USER\" &> /dev/null; then
  current_uid=\$(id -u \"$HADOOP_USER\")
  if [ \"\$current_uid\" != \"$hadoop_uid\" ]; then
    echo \"警告: 用户 $HADOOP_USER 的UID(\$current_uid)与配置($hadoop_uid)不匹配\"
  fi
else
  sudo useradd -m -u $hadoop_uid -g $hadoop_gid \
    -s \"$hadoop_shell\" -d \"$hadoop_home\" \"$HADOOP_USER\"
fi
# 确保用户主目录存在并有正确权限
sudo mkdir -p \"$hadoop_home\"
sudo chown -R \"$HADOOP_USER:$HADOOP_GROUP\" \"$hadoop_home\"
sudo chmod 755 \"$hadoop_home\"
# 创建.ssh目录用于后续SSH配置
sudo mkdir -p \"$hadoop_home/.ssh\"
sudo chown -R \"$HADOOP_USER:$HADOOP_GROUP\" \"$hadoop_home/.ssh\"
sudo chmod 700 \"$hadoop_home/.ssh\"
# 配置sudo权限(无密码)
sudo_file=\"/etc/sudoers.d/hadoop-$HADOOP_USER\"
if [ -f \"\$sudo_file\" ]; then
  sudo cp \"\$sudo_file\" \"\$sudo_file.backup-$(date +%Y%m%d%H%M%S)\"
fi
sudo_yum_path=\$(sudo which yum 2>/dev/null || echo \"/bin/yum\")
if [ -L \"/bin/yum\" ]; then
  yum_target=\$(readlink -f \"/bin/yum\")
fi
# 创建新的sudoers文件 - 注意:sudoers中不允许命令包含通配符参数
sudo tee \"\$sudo_file\" > /dev/null << 'EOF'
$HADOOP_USER ALL=(ALL) NOPASSWD:\
\$sudo_yum_path,\
/usr/bin/tee,\
/usr/bin/yum,\
/usr/bin/systemctl,\
/usr/bin/pkill,\
/usr/bin/pgrep,\
/opt/module/hadoop*/sbin/*,\
/opt/module/hadoop*/bin/*,\
/bin/cp,\
/bin/chown,\
/bin/chmod,\
/bin/mkdir,\
/bin/rm,\
/bin/mv,\
/bin/cat,\
/bin/grep,\
/bin/pgrep,\
/bin/sed,\
/bin/bash,\
/usr/sbin/useradd,\
/usr/sbin/groupadd,\
/usr/sbin/service,\
/bin/passwd
EOF
sudo chmod 440 \"\$sudo_file\"
# 验证sudoers文件语法,失败则恢复备份
if sudo visudo -c -f \"\$sudo_file\" 2>/dev/null; then
  :
else
  if ls \"\$sudo_file.backup-\"* 2>/dev/null | head -1; then
    latest_backup=\$(ls -t \"\$sudo_file.backup-\"* | head -1)
    sudo mv \"\$latest_backup\" \"\$sudo_file\"
  else
    sudo rm -f \"\$sudo_file\"
  fi
  exit 1
fi
sudo -lU \"$HADOOP_USER\" 2>/dev/null | tail -2 || echo \"无法显示sudo权限,但文件已创建\"
'" &
    # ((count++)) returns 1 when count is 0 and would abort under set -e.
    count=$((count + 1))
    # Throttle: at most three concurrent node configurations.
    if [ $((count % 3)) -eq 0 ]; then
      wait
    fi
  done
  wait # barrier for remaining background jobs

  print_info "验证所有节点的用户配置..."
  local verification_passed=0
  local verification_total=0
  for node in $ALL_NODES; do
    verification_total=$((verification_total + 1))
    if [ "$node" = "$current_host" ]; then
      # Local node: verify directly.
      if id -u "$HADOOP_USER" &> /dev/null; then
        local remote_uid=$(id -u "$HADOOP_USER")
        local remote_gid=$(id -g "$HADOOP_USER")
        if [ "$remote_uid" = "$hadoop_uid" ] && [ "$remote_gid" = "$hadoop_gid" ]; then
          verification_passed=$((verification_passed + 1))
          print_success "节点 $node: UID=$remote_uid, GID=$remote_gid ✓"
        else
          print_warning "节点 $node: UID/GID不匹配 (UID=$remote_uid, GID=$remote_gid)"
        fi
      else
        print_error "节点 $node: 用户 $HADOOP_USER 不存在"
      fi
    else
      # Remote node: verify over ssh as the current (admin) user.
      if ssh "$node" "id -u $HADOOP_USER" &> /dev/null; then
        local remote_uid=$(ssh "$node" "id -u $HADOOP_USER")
        local remote_gid=$(ssh "$node" "id -g $HADOOP_USER")
        if [ "$remote_uid" = "$hadoop_uid" ] && [ "$remote_gid" = "$hadoop_gid" ]; then
          verification_passed=$((verification_passed + 1))
          print_success "节点 $node: UID=$remote_uid, GID=$remote_gid ✓"
        else
          print_warning "节点 $node: UID/GID不匹配 (UID=$remote_uid, GID=$remote_gid)"
        fi
      else
        print_error "节点 $node: 用户 $HADOOP_USER 不存在"
      fi
    fi
  done
  if [ $verification_passed -eq $verification_total ]; then
    print_success "所有节点用户配置验证通过 ($verification_passed/$verification_total)"
  else
    print_warning "部分节点用户配置需要检查 ($verification_passed/$verification_total)"
  fi
  print_step_complete
}
local verification_passed=0 local verification_total=0 for node in $ALL_NODES; do ((verification_total++)) # 区分本地节点和远程节点 if [ "$node" = "$current_host" ]; then # 本地节点:直接使用本地命令验证 if id -u "$HADOOP_USER" &> /dev/null; then local remote_uid=$(id -u "$HADOOP_USER") local remote_gid=$(id -g "$HADOOP_USER") if [ "$remote_uid" = "$hadoop_uid" ] && [ "$remote_gid" = "$hadoop_gid" ]; then ((verification_passed++)) print_success "节点 $node: UID=$remote_uid, GID=$remote_gid ✓" else print_warning "节点 $node: UID/GID不匹配 (UID=$remote_uid, GID=$remote_gid)" fi else print_error "节点 $node: 用户 $HADOOP_USER 不存在" fi else # 远程节点:使用ssh验证(使用当前用户而非hadoop用户) if ssh "$node" "id -u $HADOOP_USER" &> /dev/null; then local remote_uid=$(ssh "$node" "id -u $HADOOP_USER") local remote_gid=$(ssh "$node" "id -g $HADOOP_USER") if [ "$remote_uid" = "$hadoop_uid" ] && [ "$remote_gid" = "$hadoop_gid" ]; then ((verification_passed++)) print_success "节点 $node: UID=$remote_uid, GID=$remote_gid ✓" else print_warning "节点 $node: UID/GID不匹配 (UID=$remote_uid, GID=$remote_gid)" fi else print_error "节点 $node: 用户 $HADOOP_USER 不存在" fi fi done if [ $verification_passed -eq $verification_total ]; then print_success "所有节点用户配置验证通过 ($verification_passed/$verification_total)" else print_warning "部分节点用户配置需要检查 ($verification_passed/$verification_total)" fi print_step_complete } # 模块3:SSH密钥配置 module_ssh_setup() { print_step "SSH配置" "设置SSH免密登录" # 获取当前主机名 local current_host=$(hostname) # 生成SSH密钥(如果不存在) local ssh_dir="/home/$HADOOP_USER/.ssh" sudo mkdir -p "$ssh_dir" sudo chown -R "$HADOOP_USER:$HADOOP_GROUP" "$ssh_dir" sudo chmod 700 "$ssh_dir" # 生成密钥(如果不存在) if [ ! 
-f "$ssh_dir/id_rsa" ]; then echo -e "\n${YELLOW}生成SSH密钥,按Enter键接受默认设置...${NC}" sudo -u "$HADOOP_USER" ssh-keygen -t rsa -P '' -f "$ssh_dir/id_rsa" -q print_success "SSH密钥已生成" else print_info "SSH密钥已存在" fi # 创建authorized_keys文件并添加公钥 sudo -u "$HADOOP_USER" cat "$ssh_dir/id_rsa.pub" >> "$ssh_dir/authorized_keys" sudo chmod 600 "$ssh_dir/authorized_keys" # 配置SSH config文件,禁用严格主机密钥检查 sudo -u "$HADOOP_USER" cat > "$ssh_dir/config" << EOF Host * StrictHostKeyChecking no UserKnownHostsFile /dev/null LogLevel ERROR ConnectTimeout 30 ServerAliveInterval 60 ServerAliveCountMax 3 EOF sudo chmod 600 "$ssh_dir/config" # 第二步:使用expect脚本自动处理密码登录到其他节点 print_info "开始配置集群SSH免密登录..." # 定义配置节点的函数(不使用expect) configure_node_ssh_simple() { local node="$1" local password="$2" if [ "$node" = "$current_host" ]; then return 0 # 跳过当前节点 fi print_info "配置节点: $node" # 1. 测试连接并接受主机密钥 echo "首次连接,接受主机密钥..." sshpass -p "$password" ssh -o StrictHostKeyChecking=no "$HADOOP_USER@$node" "exit" 2>/dev/null || true # 2. 确保远程.ssh目录存在 sshpass -p "$password" ssh "$HADOOP_USER@$node" \ "mkdir -p ~/.ssh && chmod 700 ~/.ssh" 2>/dev/null # 3. 生成远程节点的密钥(如果不存在) sshpass -p "$password" ssh "$HADOOP_USER@$node" \ "if [ ! -f ~/.ssh/id_rsa ]; then ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa -q; fi" 2>/dev/null # 4. 获取远程节点的公钥 local remote_pubkey=$(sshpass -p "$password" ssh "$HADOOP_USER@$node" \ "cat ~/.ssh/id_rsa.pub 2>/dev/null" 2>/dev/null) if [ -n "$remote_pubkey" ]; then # 将远程节点的公钥添加到本地的authorized_keys echo "$remote_pubkey" | sudo -u "$HADOOP_USER" tee -a "$ssh_dir/authorized_keys" > /dev/null print_success "节点 $node 公钥已收集" fi # 5. 
将本地公钥复制到远程节点 local local_pubkey=$(sudo -u "$HADOOP_USER" cat "$ssh_dir/id_rsa.pub") # 将本地公钥添加到远程节点的authorized_keys sshpass -p "$password" ssh "$HADOOP_USER@$node" " # 备份现有authorized_keys if [ -f ~/.ssh/authorized_keys ]; then cp ~/.ssh/authorized_keys ~/.ssh/authorized_keys.backup fi # 添加主节点的公钥 echo '$local_pubkey' > ~/.ssh/authorized_keys # 添加自己的公钥(如果不在文件中) if [ -f ~/.ssh/id_rsa.pub ]; then cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys fi # 设置权限 chmod 600 ~/.ssh/authorized_keys # 创建config文件 cat > ~/.ssh/config << 'CONFIGEOF' Host * StrictHostKeyChecking no UserKnownHostsFile /dev/null LogLevel ERROR ConnectTimeout 30 ServerAliveInterval 60 ServerAliveCountMax 3 CONFIGEOF chmod 600 ~/.ssh/config echo 'SSH配置完成' " 2>/dev/null return 0 } # 第三步:逐个配置其他节点 local failed_nodes=() for node in $ALL_NODES; do if [ "$node" != "$current_host" ]; then if configure_node_ssh_simple "$node" "$HADOOP_PASSWORD"; then print_success "节点 $node SSH配置成功" else print_warning "节点 $node SSH自动配置失败" failed_nodes+=("$node") fi fi done # 第四步:将当前节点收集的所有公钥合并并分发到所有节点 print_info "合并并分发公钥到所有节点..." # 创建合并的公钥文件 local merged_keys_file="/tmp/merged_keys_$(date +%s)" sudo -u "$HADOOP_USER" cat "$ssh_dir/authorized_keys" | sudo -u "$HADOOP_USER" sort -u > "$merged_keys_file" # 分发到所有节点 for node in $ALL_NODES; do if [ "$node" != "$current_host" ]; then print_info "同步公钥到节点: $node" if sshpass -p "$HADOOP_PASSWORD" scp -o StrictHostKeyChecking=no \ "$merged_keys_file" "$HADOOP_USER@$node:/tmp/merged_keys" 2>/dev/null; then sshpass -p "$HADOOP_PASSWORD" ssh "$HADOOP_USER@$node" " # 使用合并的公钥文件 cat /tmp/merged_keys > ~/.ssh/authorized_keys chmod 600 ~/.ssh/authorized_keys rm -f /tmp/merged_keys " 2>/dev/null print_success "节点 $node 公钥已同步" else print_warning "节点 $node 公钥同步失败" fi fi done # 清理临时文件 rm -f "$merged_keys_file" 2>/dev/null # 第五步:测试SSH免密登录 print_info "测试SSH免密登录..." 
local success_count=0 local total_tests=0 echo -e "\n${CYAN}${BOLD}🔗 SSH连接测试:${NC}" # 测试从当前节点到所有其他节点的连接 for node in $ALL_NODES; do if [ "$node" != "$current_host" ]; then ((total_tests++)) if sudo -u "$HADOOP_USER" ssh -o ConnectTimeout=5 -o BatchMode=yes \ "$node" "echo '从 $current_host$node 连接成功'" 2>/dev/null; then ((success_count++)) echo -e " ${GREEN}$current_host -> $node: ✓${NC}" else echo -e " ${RED}$current_host -> $node: ✗${NC}" fi fi done # 测试其他节点之间的连接(通过当前节点跳转) if [ -n "$WORKER_NODES" ]; then for src_node in $WORKER_NODES; do for dst_node in $ALL_NODES; do if [ "$src_node" != "$dst_node" ] && [ "$src_node" != "$current_host" ]; then ((total_tests++)) # 从工作节点ssh到其他节点 if sudo -u "$HADOOP_USER" ssh -o ConnectTimeout=5 -o BatchMode=yes \ "$src_node" "ssh -o ConnectTimeout=5 -o BatchMode=yes '$dst_node' 'echo 1'" 2>/dev/null; then ((success_count++)) echo -e " ${GREEN}$src_node -> $dst_node: ✓${NC}" else echo -e " ${RED}$src_node -> $dst_node: ✗${NC}" fi fi done done fi echo -e "\n${CYAN}${BOLD}📊 测试结果统计:${NC}" echo -e " 总连接数: $total_tests" echo -e " 成功连接: ${GREEN}$success_count${NC}" echo -e " 失败连接: ${RED}$((total_tests - success_count))${NC}" if [ $success_count -eq $total_tests ]; then print_success "所有节点间SSH免密登录配置成功!" else print_warning "部分节点间SSH免密登录需要进一步配置" # 提供简化版的手动配置指南 if [ ${#failed_nodes[@]} -gt 0 ]; then echo -e "\n${YELLOW}${BOLD}🔧 需要手动配置的节点:${NC}" for node in "${failed_nodes[@]}"; do echo -e " ${RED}• $node${NC}" done echo -e "\n${YELLOW}${BOLD}💡 手动配置步骤:${NC}" echo -e "1. 在主节点($current_host)执行:" echo -e " sudo -u $HADOOP_USER ssh-copy-id $HADOOP_USER@目标节点" echo -e " 或者" echo -e " sudo -u $HADOOP_USER cat ~/.ssh/id_rsa.pub | ssh $HADOOP_USER@目标节点 'cat >> ~/.ssh/authorized_keys'" echo -e "\n2. 
在目标节点执行:" echo -e " chmod 600 ~/.ssh/authorized_keys" echo -e " chmod 700 ~/.ssh" fi fi print_step_complete } # 模块4:Java安装 module_java_install() { print_step "Java安装" "在所有集群节点安装Java $JAVA_VERSION 运行环境" local current_host=$(hostname) local installed_count=0 local verified_count=0 local total_nodes=0 # 根据操作系统设置默认路径 local default_java_home="" # 根据操作系统设置Java安装命令 local install_cmd="" case $OS in ubuntu|debian) install_cmd="sudo apt-get update -y && sudo apt-get install -y openjdk-${JAVA_VERSION}-jdk" default_java_home="/usr/lib/jvm/java-${JAVA_VERSION}-openjdk-amd64" ;; centos|rhel|fedora|rocky) install_cmd="sudo yum install -y java-${JAVA_VERSION}-openjdk-devel" default_java_home="/usr/lib/jvm/java-${JAVA_VERSION}-openjdk" ;; *) install_cmd="sudo yum install -y java-${JAVA_VERSION}-openjdk-devel" default_java_home="/usr/lib/jvm/java-${JAVA_VERSION}-openjdk" ;; esac print_info "Java安装命令: $install_cmd" # 安装Java的函数 install_java_on_node() { local node="$1" local node_os="$2" print_info "在节点 $node 上安装Java $JAVA_VERSION..." 
# 根据操作系统调整命令 local node_install_cmd="" case $node_os in ubuntu|debian) node_install_cmd="sudo apt-get update -y && sudo DEBIAN_FRONTEND=noninteractive apt-get install -y openjdk-${JAVA_VERSION}-jdk" ;; *) node_install_cmd="sudo yum install -y java-${JAVA_VERSION}-openjdk-devel" ;; esac # 执行安装 if sudo -u "$HADOOP_USER" ssh -tt "$HADOOP_USER@$node" "bash -c ' # 检查是否已安装合适版本的Java if command -v java &> /dev/null; then current_version=\"\$(java -version 2>&1 | head -1 | cut -d\\\" -f2)\" if [[ \"\$current_version\" == \"$JAVA_VERSION\"* ]]; then echo \"Java \$current_version 已安装\" exit 0 else echo \"当前Java版本: \$current_version,需要安装$JAVA_VERSION\" # 卸载旧版本(可选) # sudo yum remove -y java-* 2>/dev/null || true fi fi echo \"开始安装Java $JAVA_VERSION...\" $node_install_cmd if command -v java &> /dev/null; then installed_version=\"\$(java -version 2>&1 | head -1 | cut -d\\\" -f2)\" echo \"安装成功: Java \$installed_version\" exit 0 else echo \"安装失败,Java命令未找到\" exit 1 fi '" 2>&1; then return 0 else return 1 fi } # 检测节点操作系统 detect_node_os() { local node="$1" sudo -u $HADOOP_USER ssh "$HADOOP_USER@$node" "cat /etc/os-release 2>/dev/null | grep '^ID=' | cut -d= -f2 | tr -d '\"'" 2>/dev/null || echo "centos" } # 处理每个节点 for node in $ALL_NODES; do ((total_nodes++)) print_info "处理节点: $node" local node_os=$(detect_node_os "$node") print_info "节点 $node 操作系统: $node_os" # 检查是否已安装合适版本的Java local java_check_result=$(sudo -u $HADOOP_USER ssh "$HADOOP_USER@$node" "command -v java && java -version 2>&1 | head -1" 2>/dev/null) if [ $? 
-eq 0 ] && [[ "$java_check_result" == *"$JAVA_VERSION"* ]]; then ((installed_count++)) print_success "节点 $node: Java $JAVA_VERSION 已安装" else # 尝试安装 if install_java_on_node "$node" "$node_os"; then ((installed_count++)) print_success "节点 $node: Java $JAVA_VERSION 安装成功" else print_error "节点 $node: Java 安装失败" # 提供手动安装命令 echo -e "${YELLOW}手动安装命令:${NC}" case $node_os in ubuntu|debian) echo -e " sudo -u $HADOOP_USER ssh $HADOOP_USER@$node 'sudo apt-get update && sudo apt-get install -y openjdk-${JAVA_VERSION}-jdk'" ;; *) echo -e " sudo -u $HADOOP_USER ssh $HADOOP_USER@$node 'sudo yum install -y java-${JAVA_VERSION}-openjdk-devel'" ;; esac fi fi # 验证安装 if sudo -u $HADOOP_USER ssh "$HADOOP_USER@$node" "command -v java &> /dev/null" 2>/dev/null; then local version=$(sudo -u $HADOOP_USER ssh "$HADOOP_USER@$node" "java -version 2>&1 | head -1 | cut -d'\"' -f2" 2>/dev/null) if [[ "$version" == *"$JAVA_VERSION"* ]]; then ((verified_count++)) print_success "节点 $node: Java验证通过 ($version)" else print_warning "节点 $node: Java版本不匹配 ($version)" fi fi done # 显示安装统计 echo -e "\n${CYAN}${BOLD}📊 Java安装统计:${NC}" echo -e " 总节点数: ${total_nodes}" echo -e " 安装成功: ${GREEN}${installed_count}/${total_nodes}${NC}" echo -e " 验证通过: ${GREEN}${verified_count}/${total_nodes}${NC}" if [ $verified_count -eq $total_nodes ]; then print_success "所有节点Java安装验证通过!" else print_warning "Java安装不完整,请检查失败节点" # 列出失败的节点 echo -e "\n${YELLOW}需要手动检查的节点:${NC}" for node in $ALL_NODES; do if ! sudo -u $HADOOP_USER ssh "$HADOOP_USER@$node" "command -v java &> /dev/null" 2>/dev/null; then echo -e " ${RED}• $node${NC}" fi done fi print_step_complete } # 模块5:目录结构创建 module_directory_setup() { print_step "目录设置" "创建Hadoop所需的目录结构" # 定义目录列表 local directories=( "$HADOOP_HOME" "$DATA_DIR/hdfs/name" "$DATA_DIR/hdfs/data" "$DATA_DIR/yarn/local" "$DATA_DIR/yarn/logs" "$DATA_DIR/tmp" "$LOG_DIR" "$PID_DIR" "/tmp/hadoop" ) # 创建目录并设置权限 for dir in "${directories[@]}"; do if [ ! 
-d "$dir" ]; then sudo mkdir -p "$dir" print_info "创建目录: $dir" fi # 设置所有者 sudo chown -R "$HADOOP_USER:$HADOOP_GROUP" "$dir" # 设置权限 if [[ "$dir" == */log* ]] || [[ "$dir" == */run* ]] || [[ "$dir" == */tmp* ]]; then sudo chmod -R 755 "$dir" else sudo chmod -R 750 "$dir" fi done # 设置setgid权限 sudo chmod g+s "$HADOOP_HOME" sudo chmod g+s "$DATA_DIR" print_success "目录结构创建完成" print_step_complete } # 模块6:Hadoop下载和安装 module_hadoop_install() { print_step "Hadoop安装" "下载并安装Hadoop $HADOOP_VERSION" local hadoop_tar="hadoop-$HADOOP_VERSION.tar.gz" local hadoop_url="$HADOOP_MIRROR/hadoop-$HADOOP_VERSION/$hadoop_tar" local download_dir="/tmp" # 检查是否已下载 if [ ! -f "$download_dir/$hadoop_tar" ]; then print_info "下载Hadoop: $HADOOP_VERSION" # 使用wget或curl下载 if command -v wget &> /dev/null; then if ! sudo wget -q "$hadoop_url" -P "$download_dir"; then print_error "下载失败,请检查网络连接或镜像地址" return 1 fi elif command -v curl &> /dev/null; then if ! sudo curl -sSL "$hadoop_url" -o "$download_dir/$hadoop_tar"; then print_error "下载失败,请检查网络连接或镜像地址" return 1 fi else print_error "没有找到wget或curl,无法下载Hadoop" return 1 fi if [ $? -eq 0 ]; then print_success "下载完成" else print_error "下载失败,请检查网络连接或镜像地址" return 1 fi else print_info "使用已下载的Hadoop包" fi # 检查文件完整性 if [ ! -s "$download_dir/$hadoop_tar" ]; then print_error "Hadoop包文件大小为0,可能下载不完整" return 1 fi # 备份现有安装 if [ -d "$HADOOP_HOME" ] && [ -d "$HADOOP_HOME/bin" ]; then local backup_dir="$HADOOP_HOME-backup-$(date +%Y%m%d-%H%M%S)" sudo mv "$HADOOP_HOME" "$backup_dir" print_info "备份现有安装到: $backup_dir" fi # 创建目标目录 sudo mkdir -p "$(dirname $HADOOP_HOME)" # 解压安装 if ! sudo tar -xzf "$download_dir/$hadoop_tar" -C "$(dirname $HADOOP_HOME)"; then print_error "解压Hadoop包失败,文件可能损坏" return 1 fi # 检查解压后的目录 if [ ! 
-d "$(dirname $HADOOP_HOME)/hadoop-$HADOOP_VERSION" ]; then print_error "解压后未找到 hadoop-$HADOOP_VERSION 目录" return 1 fi # 移动目录 if [ -d "$HADOOP_HOME" ]; then sudo rm -rf "$HADOOP_HOME" fi sudo mv "$(dirname $HADOOP_HOME)/hadoop-$HADOOP_VERSION" "$HADOOP_HOME" # 设置所有者 sudo chown -R "$HADOOP_USER:$HADOOP_GROUP" "$HADOOP_HOME" # 验证安装(文件存在性) if [ -f "$HADOOP_HOME/bin/hadoop" ] && [ -d "$HADOOP_HOME/etc/hadoop" ]; then print_success "Hadoop安装成功: $HADOOP_HOME" # 尝试验证版本,但不强制要求 if command -v java &> /dev/null; then # 检测Java安装路径 local java_home_for_test=$(dirname $(dirname $(readlink -f $(which java)))) if [ -d "$java_home_for_test" ]; then # 使用检测到的JAVA_HOME运行hadoop version local version_output=$(sudo -u "$HADOOP_USER" env JAVA_HOME="$java_home_for_test" "$HADOOP_HOME/bin/hadoop" version 2>&1 | head -2) if echo "$version_output" | grep -q "Hadoop"; then print_info "$version_output" else print_warning "版本检查失败,将在环境变量设置后重试" print_info "已成功安装Hadoop $HADOOP_VERSION" fi else print_info "Hadoop $HADOOP_VERSION 已成功安装" print_info "Java环境将在后续步骤中配置" fi else print_info "Hadoop $HADOOP_VERSION 已成功安装" print_info "注意:Java未安装或未找到,将在后续步骤中处理" fi else print_error "Hadoop安装失败" return 1 fi print_step_complete return 0 } # 模块7:配置模板生成 module_config_templates() { print_step "配置模板" "生成Hadoop配置文件模板" # 创建配置目录 local conf_dir="$HADOOP_HOME/etc/hadoop" # 生成hadoop-env.sh cat > /tmp/hadoop-env.sh.template << 'EOF' #!/usr/bin/env bash # JAVA_HOME export JAVA_HOME=${JAVA_HOME} # Hadoop配置目录 export HADOOP_CONF_DIR=${HADOOP_CONF_DIR} # Hadoop日志目录 export HADOOP_LOG_DIR=${HADOOP_LOG_DIR} # Hadoop PID目录 export HADOOP_PID_DIR=${HADOOP_PID_DIR} # Hadoop堆内存设置 export HADOOP_HEAPSIZE_MAX=1024 export HADOOP_HEAPSIZE=1024 # Hadoop 3.x 服务用户配置(必须) export HDFS_NAMENODE_USER=${HADOOP_USER} export HDFS_DATANODE_USER=${HADOOP_USER} export HDFS_SECONDARYNAMENODE_USER=${HADOOP_USER} export YARN_RESOURCEMANAGER_USER=${HADOOP_USER} export YARN_NODEMANAGER_USER=${HADOOP_USER} export HDFS_JOURNALNODE_USER=${HADOOP_USER} export 
HDFS_ZKFC_USER=${HADOOP_USER} export MAPRED_HISTORYSERVER_USER=${HADOOP_USER} # 垃圾回收优化 export HADOOP_OPTS="$HADOOP_OPTS -XX:+UseG1GC -XX:MaxGCPauseMillis=200" export HADOOP_OPTS="$HADOOP_OPTS -XX:+UnlockExperimentalVMOptions" export HADOOP_OPTS="$HADOOP_OPTS -XX:+UseContainerSupport" # 网络优化 export HADOOP_CLIENT_OPTS="$HADOOP_CLIENT_OPTS -Djava.net.preferIPv4Stack=true" EOF # 生成core-site.xml模板 cat > /tmp/core-site.xml.template << 'EOF' <?xml version="1.0" encoding="UTF-8"?> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> <configuration> <!-- HDFS 默认文件系统地址 --> <property> <name>fs.defaultFS</name> <value>hdfs://${NAMENODE_NODE}:9820</value> <description>NameNode RPC地址,客户端通过此地址连接HDFS</description> </property> <!-- Hadoop 临时目录 --> <property> <name>hadoop.tmp.dir</name> <value>${DATA_DIR}/tmp</value> <description>Hadoop临时文件目录</description> </property> <!-- I/O 缓冲区大小 --> <property> <name>io.file.buffer.size</name> <value>131072</value> <description>读写操作的缓冲区大小</description> </property> <!-- WebHDFS 启用 --> <property> <name>dfs.webhdfs.enabled</name> <value>true</value> <description>启用WebHDFS REST API</description> </property> <!-- 静态用户配置(Web UI) --> <property> <name>hadoop.http.staticuser.user</name> <value>${HADOOP_USER}</value> <description>Web UI的匿名用户</description> </property> </configuration> EOF # 生成hdfs-site.xml模板 cat > /tmp/hdfs-site.xml.template << 'EOF' <?xml version="1.0" encoding="UTF-8"?> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> <configuration> <!-- 副本数 --> <property> <name>dfs.replication</name> <value>3</value> <description>数据块副本数量</description> </property> <!-- NameNode RPC 地址 --> <property> <name>dfs.namenode.rpc-address</name> <value>${NAMENODE_NODE}:9820</value> <description>NameNode RPC服务地址</description> </property> <!-- NameNode HTTP 地址 --> <property> <name>dfs.namenode.http-address</name> <value>${NAMENODE_NODE}:9870</value> <description>NameNode Web UI地址</description> </property> <!-- SecondaryNameNode HTTP 地址 --> 
<property> <name>dfs.namenode.secondary.http-address</name> <value>${SECONDARY_NODE}:9868</value> <description>SecondaryNameNode Web UI地址</description> </property> <!-- NameNode 元数据存储目录 --> <property> <name>dfs.namenode.name.dir</name> <value>file://${DATA_DIR}/hdfs/name</value> <description>NameNode元数据存储目录</description> </property> <!-- DataNode 数据存储目录 --> <property> <name>dfs.datanode.data.dir</name> <value>file://${DATA_DIR}/hdfs/data</value> <description>DataNode数据存储目录</description> </property> <!-- 数据块大小 --> <property> <name>dfs.blocksize</name> <value>128m</value> <description>HDFS数据块大小</description> </property> <!-- 权限检查(开发环境可关闭) --> <property> <name>dfs.permissions.enabled</name> <value>false</value> <description>是否启用HDFS权限检查</description> </property> </configuration> EOF # 生成yarn-site.xml模板 cat > /tmp/yarn-site.xml.template << 'EOF' <?xml version="1.0" encoding="UTF-8"?> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> <configuration> <!-- ResourceManager 主机名 --> <property> <name>yarn.resourcemanager.hostname</name> <value>${RESOURCEMANAGER_NODE}</value> <description>ResourceManager所在主机</description> </property> <!-- ResourceManager Web UI地址 --> <property> <name>yarn.resourcemanager.webapp.address</name> <value>${RESOURCEMANAGER_NODE}:8088</value> <description>ResourceManager Web UI地址</description> </property> <!-- NodeManager 辅助服务 --> <property> <name>yarn.nodemanager.aux-services</name> <value>mapreduce_shuffle</value> <description>NodeManager辅助服务</description> </property> <!-- NodeManager 可用内存 --> <property> <name>yarn.nodemanager.resource.memory-mb</name> <value>8192</value> <description>NodeManager可用内存(MB)</description> </property> <!-- NodeManager 可用CPU核数 --> <property> <name>yarn.nodemanager.resource.cpu-vcores</name> <value>8</value> <description>NodeManager可用CPU核数</description> </property> <!-- 启用日志聚合 --> <property> <name>yarn.log-aggregation-enable</name> <value>true</value> <description>启用日志聚合功能</description> </property> <!-- 
JobHistory Server地址 --> <property> <name>yarn.log.server.url</name> <value>http://${JOBHISTORY_NODE}:19888/jobhistory/logs</value> <description>JobHistory Server日志URL</description> </property> <!-- 日志保留时间 --> <property> <name>yarn.log-aggregation.retain-seconds</name> <value>604800</value> <description>日志保留时间(秒)</description> </property> <!--环境变量的继承.3.1.3的bug.3.2.x时,就不需要再配置环境变量了--> <property> <name>yarn.nodemanager.env-whitelist</name> <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,HADOOP_MAPRED_HOME</value> </property> </configuration> EOF # 生成mapred-site.xml模板 cat > /tmp/mapred-site.xml.template << 'EOF' <?xml version="1.0" encoding="UTF-8"?> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> <configuration> <!-- MapReduce框架 --> <property> <name>mapreduce.framework.name</name> <value>yarn</value> <description>指定MapReduce运行在YARN上</description> </property> <!-- JobHistory Server地址 --> <property> <name>mapreduce.jobhistory.address</name> <value>${JOBHISTORY_NODE}:10020</value> <description>JobHistory Server RPC地址</description> </property> <!-- JobHistory Server Web UI地址 --> <property> <name>mapreduce.jobhistory.webapp.address</name> <value>${JOBHISTORY_NODE}:19888</value> <description>JobHistory Server Web UI地址</description> </property> <!-- Map任务内存设置 --> <property> <name>mapreduce.map.memory.mb</name> <value>2048</value> <description>Map任务内存(MB)</description> </property> <!-- Reduce任务内存设置 --> <property> <name>mapreduce.reduce.memory.mb</name> <value>4096</value> <description>Reduce任务内存(MB)</description> </property> <!-- 启用Map输出压缩 --> <property> <name>mapreduce.map.output.compress</name> <value>true</value> <description>启用Map输出压缩</description> </property> <!-- Map输出压缩编码器 --> <property> <name>mapreduce.map.output.compress.codec</name> <value>org.apache.hadoop.io.compress.SnappyCodec</value> <description>Map输出压缩编码器</description> </property> </configuration> EOF # 生成workers文件模板 cat 
> /tmp/workers.template << 'EOF' # Hadoop集群工作节点列表 # 每行一个节点主机名或IP ${ALL_NODES} EOF # 生成环境变量模板 cat > /tmp/hadoop-profile.template << 'EOF' # Hadoop环境变量配置 export HADOOP_HOME=${HADOOP_HOME} export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop export HADOOP_MAPRED_HOME=${HADOOP_HOME} export HADOOP_COMMON_HOME=${HADOOP_HOME} export HADOOP_HDFS_HOME=${HADOOP_HOME} export HADOOP_CLASSPATH=$(${HADOOP_HOME}/bin/hadoop classpath 2>/dev/null) export YARN_HOME=${HADOOP_HOME} export PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin # Java环境 export JAVA_HOME=${JAVA_HOME} export PATH=${PATH}:${JAVA_HOME}/bin EOF print_success "配置模板生成完成" print_step_complete } # 模块8:配置文件渲染 module_config_render() { print_step "配置渲染" "将模板渲染为实际配置文件" local conf_dir="$HADOOP_HOME/etc/hadoop" # 准备替换变量 local vars=" s|\\\${HADOOP_USER}|$HADOOP_USER|g s|\\\${HADOOP_GROUP}|$HADOOP_GROUP|g s|\\\${HADOOP_HOME}|$HADOOP_HOME|g s|\\\${DATA_DIR}|$DATA_DIR|g s|\\\${LOG_DIR}|$LOG_DIR|g s|\\\${PID_DIR}|$PID_DIR|g s|\\\${JAVA_HOME}|$JAVA_HOME|g s|\\\${NAMENODE_NODE}|$NAMENODE_NODE|g s|\\\${RESOURCEMANAGER_NODE}|$RESOURCEMANAGER_NODE|g s|\\\${SECONDARY_NODE}|$SECONDARY_NODE|g s|\\\${JOBHISTORY_NODE}|$JOBHISTORY_NODE|g s|\\\${ALL_NODES}|$ALL_NODES|g " # 渲染配置文件 local templates=( "hadoop-env.sh" "core-site.xml" "hdfs-site.xml" "yarn-site.xml" "mapred-site.xml" "workers" ) for template in "${templates[@]}"; do if [ -f "/tmp/$template.template" ]; then # 特殊处理:hadoop-env.sh 中的 ${JAVA_HOME} 应该保留为变量,不进行替换 if [ "$template" = "hadoop-env.sh" ]; then # 对于 hadoop-env.sh,我们只替换非 JAVA_HOME 的变量 local env_vars=" s|\\\${HADOOP_USER}|$HADOOP_USER|g s|\\\${HADOOP_HOME}|$HADOOP_HOME|g s|\\\${LOG_DIR}|$LOG_DIR|g s|\\\${PID_DIR}|$PID_DIR|g " sed "$env_vars" "/tmp/$template.template" > "$conf_dir/$template" else sed "$vars" "/tmp/$template.template" > "$conf_dir/$template" fi sudo chown "$HADOOP_USER:$HADOOP_GROUP" "$conf_dir/$template" print_info "生成配置文件: $template" fi done # 渲染环境变量文件。 sed "$vars" "/tmp/hadoop-profile.template" > 
"/tmp/hadoop-profile.sh" sudo cp "/tmp/hadoop-profile.sh" "/etc/profile.d/hadoop.sh" print_success "配置文件渲染完成" print_step_complete } # 模块9:配置分发 module_config_distribute() { print_step "配置分发" "将配置分发到所有集群节点" local current_host=$(hostname) # 先测试SSH免密登录是否正常工作 print_info "检查SSH免密登录状态..." local ssh_working_nodes=() local ssh_problem_nodes=() for node in $ALL_NODES; do if [ "$node" != "$current_host" ]; then # 测试SSH免密登录 if sudo -u "$HADOOP_USER" ssh -o ConnectTimeout=5 -o BatchMode=yes "$node" "exit" 2>/dev/null; then ssh_working_nodes+=("$node") print_success "节点 $node: SSH免密登录正常" else ssh_problem_nodes+=("$node") print_warning "节点 $node: SSH免密登录有问题,将使用密码或备选方案" fi fi done # 如果有SSH问题的节点,尝试使用sshpass local use_sshpass=false if [ ${#ssh_problem_nodes[@]} -gt 0 ] && command -v sshpass &> /dev/null; then echo -e "${YELLOW}检测到sshpass工具,是否使用密码自动登录?(y/n): ${NC}\c" read -r use_sshpass_choice if [[ "$use_sshpass_choice" =~ ^[Yy]$ ]]; then use_sshpass=true print_info "将使用sshpass进行密码自动登录" fi fi # 分发函数,支持两种模式 distribute_with_ssh() { local node="$1" local cmd="$2" if [ "$use_sshpass" = true ] && [ -n "$HADOOP_PASSWORD" ]; then # 使用sshpass执行命令 sshpass -p "$HADOOP_PASSWORD" sudo -u "$HADOOP_USER" ssh -o StrictHostKeyChecking=no "$HADOOP_USER@$node" "$cmd" else # 使用普通SSH(期望免密登录) sudo -u "$HADOOP_USER" ssh -o StrictHostKeyChecking=no "$HADOOP_USER@$node" "$cmd" fi } distribute_file_with_scp() { local src="$1" local node="$2" local dst="$3" if [ "$use_sshpass" = true ] && [ -n "$HADOOP_PASSWORD" ]; then # 使用sshpass执行scp sshpass -p "$HADOOP_PASSWORD" scp -o StrictHostKeyChecking=no "$src" "$HADOOP_USER@$node:$dst" else # 使用普通scp(期望免密登录) sudo -u "$HADOOP_USER" scp -o StrictHostKeyChecking=no "$src" "$HADOOP_USER@$node:$dst" fi } # 定义要分发的文件和目录 local distribute_items=( "$HADOOP_HOME" "/etc/profile.d/hadoop.sh" "/etc/sudoers.d/hadoop-$HADOOP_USER" ) # 在主节点读取文件内容 local profile_content if [ -f "/tmp/hadoop-profile.sh" ]; then profile_content=$(cat "/tmp/hadoop-profile.sh") else print_warning 
"主节点环境变量文件不存在,跳过" continue fi for node in $ALL_NODES; do # 获取节点的实际Java路径 print_info "获取节点 $node 的实际Java路径..." local node_java_home=$(distribute_with_ssh "$node" " if command -v java &> /dev/null; then java_cmd=\$(which java) if [ -L \"\$java_cmd\" ]; then java_cmd=\$(readlink -f \"\$java_cmd\") fi dirname \"\$(dirname \"\$java_cmd\")\" else echo '' fi " 2>/dev/null) if [ -n "$node_java_home" ] && [ -d "$node_java_home" ]; then print_success "节点 $node 的Java路径: $node_java_home" else print_warning "节点 $node 无法获取Java路径,使用默认: /usr/lib/jvm/java-11-openjdk" node_java_home="/usr/lib/jvm/java-11-openjdk" fi if [ "$node" != "$(hostname)" ]; then print_info "分发配置到节点: $node" # 创建目标目录 #sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" "sudo mkdir -p $(dirname $HADOOP_HOME)" # 1. 在远程节点上创建目标目录并设置权限(使用sudo) print_info "在节点 $node 上创建目录" distribute_with_ssh "$node" " sudo mkdir -p $(dirname $HADOOP_HOME) sudo mkdir -p '$DATA_DIR/hdfs/name' sudo mkdir -p '$DATA_DIR/hdfs/data' sudo mkdir -p '$DATA_DIR/yarn/local' sudo mkdir -p '$DATA_DIR/yarn/logs' sudo mkdir -p '$DATA_DIR/tmp' sudo mkdir -p '$LOG_DIR' sudo mkdir -p '$PID_DIR' sudo mkdir -p '/tmp/hadoop' # 设置所有者和权限(需要在创建后设置) echo '设置目录所有者和权限...' sudo chown -R '$HADOOP_USER:$HADOOP_GROUP' '$(dirname $HADOOP_HOME)' sudo chown -R '$HADOOP_USER:$HADOOP_GROUP' '$DATA_DIR' sudo chown -R '$HADOOP_USER:$HADOOP_GROUP' '$LOG_DIR' sudo chown -R '$HADOOP_USER:$HADOOP_GROUP' '$PID_DIR' sudo chown -R '$HADOOP_USER:$HADOOP_GROUP' '/tmp/hadoop' # 设置目录权限 sudo chmod -R 755 '$(dirname $HADOOP_HOME)' sudo chmod -R 755 '$DATA_DIR' sudo chmod -R 755 '$LOG_DIR' sudo chmod -R 755 '$PID_DIR' sudo chmod -R 777 '/tmp/hadoop' " 2>/dev/null || print_warning "节点 $node 目录创建失败" # 2. 
分发Hadoop安装目录(排除不需要的文件,使用tar+ssh,避免权限问题) # 先检查远程目录是否可写 if distribute_with_ssh "$node" "test -w $(dirname $HADOOP_HOME)" 2>/dev/null; then # 使用tar管道传输 cd "$(dirname $HADOOP_HOME)" tar czf - "$(basename $HADOOP_HOME)" 2>/dev/null | \ distribute_with_ssh "$node" "tar xzf - -C $(dirname $HADOOP_HOME) && chmod -R 755 $HADOOP_HOME" 2>/dev/null if [ $? -ne 0 ]; then print_warning "tar传输失败,尝试rsync..." # 使用rsync rsync -avz --exclude='logs/*' --exclude='*.pid' "$HADOOP_HOME/" "$HADOOP_USER@$node:$HADOOP_HOME/" 2>/dev/null || true fi else print_warning "目录不可写,尝试使用sudo..." # 使用sudo创建并设置权限 distribute_with_ssh "$node" "sudo tar xzf - -C $(dirname $HADOOP_HOME) && sudo chown -R $HADOOP_USER:$HADOOP_GROUP $HADOOP_HOME" 2>/dev/null < <(tar czf - -C "$(dirname $HADOOP_HOME)" "$(basename $HADOOP_HOME)") 2>/dev/null fi # 3. 分发环境变量文件 print_info "分发环境变量配置" # 在远程节点上修改JAVA_HOME distribute_with_ssh "$node" " # 创建临时文件 cat > /tmp/hadoop-profile-template.sh << 'EOF' ${profile_content} EOF # 备份原始文件 if [ -f /tmp/hadoop-profile-template.sh ]; then # 获取原文件中的旧Java路径 old_java_path=\$(grep 'export JAVA_HOME=' /tmp/hadoop-profile-template.sh | cut -d'=' -f2) # 使用节点的实际Java路径替换JAVA_HOME sed -i \"s|export JAVA_HOME=.*|export JAVA_HOME=${node_java_home}|\" /tmp/hadoop-profile-template.sh if [ -n \"\$old_java_path\" ]; then # 替换PATH中的旧Java路径 sed -i \"s|:\${old_java_path}/bin|:${node_java_home}/bin|g\" /tmp/hadoop-profile-template.sh fi # 安装修改后的文件 sudo cp /tmp/hadoop-profile-template.sh /etc/profile.d/hadoop.sh sudo chmod 644 /etc/profile.d/hadoop.sh echo \"环境变量文件已更新\" else echo '错误:找不到原始环境变量文件' fi " 2>/dev/null || print_warning "节点 $node 环境变量设置失败" # 4. 
分发sudoers文件 print_info "分发sudoers配置" if [ -f "/etc/sudoers.d/hadoop-$HADOOP_USER" ]; then distribute_file_with_scp "/etc/sudoers.d/hadoop-$HADOOP_USER" "$node" "/tmp/hadoop-sudoers" 2>/dev/null distribute_with_ssh "$node" "sudo cp /tmp/hadoop-sudoers /etc/sudoers.d/hadoop-$HADOOP_USER && sudo chmod 440 /etc/sudoers.d/hadoop-$HADOOP_USER && sudo rm -f /tmp/hadoop-sudoers" 2>/dev/null || true fi print_success "节点 $node 配置分发完成" else #主节点需要更新环境变量配置文件 if [ -f '/etc/profile.d/hadoop.sh' ]; then # 获取原文件中的旧Java路径 local old_java_path=$(grep 'export JAVA_HOME=' /etc/profile.d/hadoop.sh | cut -d'=' -f2) # 使用节点的实际Java路径替换JAVA_HOME sed -i "s|export JAVA_HOME=.*|export JAVA_HOME=${node_java_home}|" /etc/profile.d/hadoop.sh # 同时更新PATH中的Java路径 if [ -n "$old_java_path" ]; then # 替换PATH中的旧Java路径 sed -i "s|:${old_java_path}/bin|:${node_java_home}/bin|g" /etc/profile.d/hadoop.sh fi echo "环境变量文件已更新" echo "JAVA_HOME设置为: ${node_java_home}" else echo "错误:找不到原始环境变量文件" fi fi done # 验证分发结果 print_info "验证分发结果" local verification_passed=0 local verification_total=0 for node in $ALL_NODES; do if [ "$node" != "$(hostname)" ]; then ((verification_total++)) # 检查Hadoop是否成功分发 if distribute_with_ssh "$node" "[ -f '$HADOOP_HOME/bin/hadoop' ]" 2>/dev/null; then ((verification_passed++)) print_success "节点 $node: Hadoop安装验证成功" else print_warning "节点 $node: Hadoop安装可能不完整" fi fi done if [ $verification_passed -eq $verification_total ]; then print_success "所有节点配置分发验证通过 ($verification_passed/$verification_total)" else print_warning "部分节点配置分发需要检查 ($verification_passed/$verification_total)" fi print_step_complete } # 创建一个专门的修复/home/hadoop_user/.bashrc文件的函数 fix_bashrc_on_node() { local node=$1 local hadoop_user=$2 print_info "修复节点 $node 的.bashrc文件..." sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" ' # 备份文件 backup_file="$HOME/.bashrc.backup.$(date +%Y%m%d%H%M%S)" cp ~/.bashrc "$backup_file" # 创建全新的.bashrc cat > ~/.bashrc << "EOF" # .bashrc # Source global definitions if [ -f /etc/bashrc ]; then . 
/etc/bashrc fi # Uncomment the following line if you don't like systemctl's auto-paging feature: # export SYSTEMD_PAGER= # User specific aliases and functions # 加载Hadoop环境变量 if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi EOF echo "修复完成,备份在: $backup_file" ' 2>/dev/null print_success "节点 $node .bashrc修复成功" || \ print_warning "节点 $node .bashrc修复失败" } # 模块10:环境变量生效 module_environment_setup() { print_step "环境设置" "设置Hadoop环境变量" # 为当前用户设置环境变量 if ! grep -q "HADOOP_HOME" "/home/$HADOOP_USER/.bashrc"; then cat >> "/home/$HADOOP_USER/.bashrc" << EOF # 加载Hadoop环境变量 if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi EOF print_success "为 $HADOOP_USER 用户设置环境变量" fi # 3. 配置所有从节点的.bashrc for node in $ALL_NODES; do if [ "$node" != "$(hostname)" ]; then print_info "配置节点 $node 的.bashrc..." fix_bashrc_on_node "$node" "$HADOOP_USER" fi done # 立即生效(使用更安全的方法) if sudo -u "$HADOOP_USER" bash -c "source ~/.bashrc 2>/dev/null"; then print_success "环境变量立即生效成功" else print_warning "环境变量立即生效失败,可能需要重新登录" fi # 在所有节点生效 for node in $ALL_NODES; do if [ "$node" != "$(hostname)" ]; then print_info "在节点 $node 上生效环境变量..." if sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" "source ~/.bashrc 2>/dev/null"; then print_success "节点 $node 环境变量生效成功" else print_warning "节点 $node 环境变量生效失败" fi fi done # 5. 最终验证 print_info "最终验证所有节点环境变量..." 
local all_nodes_valid=true for node in $ALL_NODES; do if sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" " if [ -n \"\\\$JAVA_HOME\" ] && [ -n \"\\\$HADOOP_HOME\" ]; then echo '✓ 节点 $node: 环境变量已设置' exit 0 else echo '✗ 节点 $node: 环境变量未正确设置' exit 1 fi " 2>/dev/null; then print_success "节点 $node 环境变量验证通过" else print_warning "节点 $node 环境变量验证失败" all_nodes_valid=false fi done if [ "$all_nodes_valid" = true ]; then print_success "所有节点环境变量配置验证通过 ✓" else print_warning "部分节点环境变量配置需要检查" fi print_success "环境变量设置完成" print_step_complete } # 模块11:HDFS初始化 module_hdfs_init() { print_step "HDFS初始化" "格式化HDFS NameNode" # 添加调试信息 print_info "检查环境变量状态..." print_info "当前用户: $(whoami)" print_info "HADOOP_HOME: $HADOOP_HOME" print_info "JAVA_HOME: $JAVA_HOME" print_info "PATH: $PATH" # 检查hdfs命令是否存在 #if command -v hdfs &> /dev/null; then # print_success "hdfs命令在PATH中找到" # print_info "hdfs路径: $(which hdfs)" #else # print_warning "hdfs命令不在PATH中" # print_info "使用绝对路径: $HADOOP_HOME/bin/hdfs" #fi # ## 检查Java #if command -v java &> /dev/null; then # print_success "java命令在PATH中找到" # print_info "java路径: $(which java)" #else # print_error "java命令不在PATH中" # return 1 #fi # 检查是否已格式化 local name_dir="$DATA_DIR/hdfs/name" if [ -d "$name_dir" ] && [ "$(ls -A $name_dir 2>/dev/null)" ]; then print_warning "NameNode数据目录非空,可能已格式化" echo -e "${YELLOW}是否重新格式化HDFS?(y/n): ${NC}\c" read -r format_choice if [[ ! 
"$format_choice" =~ ^[Yy]$ ]]; then print_info "跳过HDFS格式化" print_step_complete return 0 fi # 备份现有数据 local backup_dir="$name_dir-backup-$(date +%Y%m%d-%H%M%S)" sudo mv "$name_dir" "$backup_dir" sudo mkdir -p "$name_dir" sudo chown -R "$HADOOP_USER:$HADOOP_GROUP" "$name_dir" print_info "现有数据备份到: $backup_dir" fi # 执行格式化 print_info "执行HDFS格式化" # ========== 关键修复:使用完整路径,避免 sudo PATH 问题 ========== local hdfs_cmd="$HADOOP_HOME/bin/hdfs" # 执行格式化 if sudo -u "$HADOOP_USER" HDFS_CMD="$hdfs_cmd" bash -c ' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi echo "=== 格式化环境 ===" echo "JAVA_HOME: $JAVA_HOME" echo "HADOOP_HOME: $HADOOP_HOME" echo "使用的hdfs命令: $HDFS_CMD" echo "Java验证:" java -version 2>&1 | head -3 echo "=== 开始格式化 ===" # 执行格式化 "$HDFS_CMD" namenode -format -force -nonInteractive 2>&1 ' 2>&1 | tee /tmp/hdfs-format.log; then # 检查是否真正成功 if grep -q -E "(successfully formatted|Storage directory.*has been|Exiting with status 0)" /tmp/hdfs-format.log; then print_success "HDFS格式化成功" # 显示成功信息 print_info "成功信息:" grep -E "(successfully|has been|Exiting)" /tmp/hdfs-format.log else print_warning "命令执行完成但未找到标准成功标志" print_info "最后输出:" tail -5 /tmp/hdfs-format.log fi else print_error "HDFS格式化失败" # 显示错误信息 print_info "错误详情:" grep -i "error\|fail\|not found" /tmp/hdfs-format.log | head -10 || tail -10 /tmp/hdfs-format.log return 1 fi print_step_complete } # 模块12:防火墙配置 module_firewall_setup() { print_step "防火墙配置" "配置必要的防火墙规则" # 检查防火墙服务 local firewall_cmd="" if command -v ufw &> /dev/null; then firewall_cmd="ufw" elif command -v firewall-cmd &> /dev/null; then firewall_cmd="firewalld" elif command -v iptables &> /dev/null; then firewall_cmd="iptables" else print_warning "未检测到防火墙工具,跳过配置" print_step_complete return 0 fi # 配置端口规则 local ports=("9820" "9870" "9866" "9864" "9868" "8088" "8042" "19888" "10020") for port in "${ports[@]}"; do case $firewall_cmd in ufw) sudo ufw allow "$port/tcp" > /dev/null 2>&1 
;; firewalld) sudo firewall-cmd --permanent --add-port="$port/tcp" > /dev/null 2>&1 ;; iptables) sudo iptables -A INPUT -p tcp --dport "$port" -j ACCEPT > /dev/null 2>&1 ;; esac done # 应用配置 case $firewall_cmd in ufw) sudo ufw reload > /dev/null 2>&1 ;; firewalld) sudo firewall-cmd --reload > /dev/null 2>&1 ;; iptables) sudo service iptables save > /dev/null 2>&1 2>/dev/null || true ;; esac print_success "防火墙规则配置完成" print_step_complete } # 模块13:集群启动 module_cluster_start() { print_step "集群启动" "启动Hadoop集群服务" # 确保使用完整路径 local hadoop_sbin="$HADOOP_HOME/sbin" local hadoop_bin="$HADOOP_HOME/bin" # 启动HDFS,NameNode 102 print_info "启动HDFS" if [ -f "$hadoop_sbin/start-dfs.sh" ]; then #sudo -u "$HADOOP_USER" "$hadoop_sbin/start-dfs.sh" sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$NAMENODE_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi # 启动HDFS "$HADOOP_HOME/sbin/start-dfs.sh" EOF 2>/dev/null else print_error "找不到 start-dfs.sh: $hadoop_sbin/start-dfs.sh" return 1 fi # 启动YARN 103 print_info "启动YARN" if [ -f "$hadoop_sbin/start-yarn.sh" ]; then #sudo -u "$HADOOP_USER" "$hadoop_sbin/start-yarn.sh" sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$RESOURCEMANAGER_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi # 启动YARN "$HADOOP_HOME/sbin/start-yarn.sh" EOF 2>/dev/null print_success "YARN启动命令已执行" else print_error "找不到 start-yarn.sh: $hadoop_sbin/start-yarn.sh" return 1 fi # 启动JobHistory Server print_info "启动JobHistory Server" if [ -f "$hadoop_bin/mapred" ]; then #sudo -u "$HADOOP_USER" "$hadoop_bin/mapred" --daemon start historyserver sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$JOBHISTORY_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi # 启动JobHistory Server "$HADOOP_HOME/bin/mapred" 
--daemon start historyserver EOF 2>/dev/null print_success "JobHistory Server启动命令已执行" else print_error "找不到 mapred 命令: $hadoop_bin/mapred" fi # 单独启动SecondaryNameNode 104 print_info "启动SecondaryNameNode" local hdfs_cmd="$HADOOP_HOME/bin/hdfs" if [ -f "$hdfs_cmd" ]; then #sudo -u "$HADOOP_USER" ssh "$SECONDARY_NODE" "'$hdfs_cmd' --daemon start secondarynamenode" sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$SECONDARY_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi # 单独启动SecondaryNameNode "$HADOOP_HOME/bin/hdfs" --daemon start EOF 2>/dev/null print_success "SecondaryNameNode启动命令已执行" else print_error "找不到 hdfs 命令: $hdfs_cmd" fi # 等待服务启动 print_info "等待服务启动..." sleep 10 print_step_complete } # 模块14:集群验证 module_cluster_validate() { print_step "集群验证" "验证Hadoop集群功能" local validation_passed=0 local validation_total=5 local hadoop_bin="$HADOOP_HOME/bin" # 测试1: HDFS基本操作 print_info "测试1: HDFS基本操作..." sudo -u "$HADOOP_USER" HADOOP_BIN="$hadoop_bin" bash -c ' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_BIN/hdfs" dfs -mkdir -p /test-validation "$HADOOP_BIN/hdfs" dfs -put /etc/hosts /test-validation/hosts-copy ' #sudo -u "$HADOOP_USER" hdfs dfs -put /etc/hosts /test-validation/hosts-copy local hdfs_test_result=$? if [ $hdfs_test_result -eq 0 ]; then print_success "HDFS基本操作测试通过" validation_passed=$((validation_passed + 1)) else print_error "HDFS基本操作测试失败" fi # 测试2: HDFS状态检查 print_info "测试2: HDFS状态检查..." #sudo -u "$HADOOP_USER" hdfs dfsadmin -report 2>&1 | grep -q "Live datanodes" sudo -u "$HADOOP_USER" HADOOP_BIN="$hadoop_bin" bash -c ' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_BIN/hdfs" dfsadmin -report ' local hdfs_status_result=$? 
if [ $hdfs_status_result -eq 0 ]; then local datanodes_count=$(sudo -u "$HADOOP_USER" HADOOP_BIN="$hadoop_bin" bash -c ' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_BIN/hdfs" dfsadmin -report ' 2>&1 | grep "Live datanodes" | awk '{print $3}') print_success "HDFS状态正常,活跃DataNode数: $datanodes_count" validation_passed=$((validation_passed + 1)) else print_error "HDFS状态检查失败" fi # 测试3: YARN状态检查 print_info "测试3: YARN状态检查..." sudo -u "$HADOOP_USER" HADOOP_BIN="$hadoop_bin" bash -c ' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_BIN/yarn" node -list ' 2>&1 | grep -q "Total Nodes" local yarn_status_result=$? if [ $yarn_status_result -eq 0 ]; then local yarn_nodes_count=$(sudo -u "$HADOOP_USER" HADOOP_BIN="$hadoop_bin" bash -c ' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_BIN/yarn" node -list ' 2>&1 | grep "Total Nodes" | awk '{print $3}') print_success "YARN状态正常,节点数: $yarn_nodes_count" validation_passed=$((validation_passed + 1)) else print_error "YARN状态检查失败" fi # 测试4: MapReduce示例作业 print_info "测试4: MapReduce示例作业..." 
local test_output="/test-validation/output-$(date +%s)" sudo -u "$HADOOP_USER" HADOOP_HOME="$HADOOP_HOME" HADOOP_BIN="$hadoop_bin" HADOOP_VERSION="$HADOOP_VERSION" TEST_OUTPUT="$test_output" bash -c ' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi echo -e "Hadoop\nCluster\nValidation\nTest" | "$HADOOP_BIN/hdfs" dfs -put - /test-validation/test-input.txt sleep 3 echo "HADOOP_HOME: $HADOOP_HOME" echo "HADOOP_CLASSPATH: $HADOOP_CLASSPATH" echo "HADOOP_MAPRED_HOME: $HADOOP_MAPRED_HOME" "$HADOOP_BIN/hadoop" jar "$HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-$HADOOP_VERSION.jar" wordcount /test-validation/test-input.txt "$TEST_OUTPUT" ' 2>&1 | tail -50 local mapreduce_test_result=$? if [ $mapreduce_test_result -eq 0 ]; then print_success "MapReduce示例作业测试通过" validation_passed=$((validation_passed + 1)) else print_warning "MapReduce示例作业测试可能有警告" fi # 测试5: Web UI访问检查 print_info "测试5: Web UI服务检查..." if check_http_service "http://$NAMENODE_NODE:9870" "NameNode" 15; then print_success "Web UI访问检查测试通过" validation_passed=$((validation_passed + 1)) fi # 清理测试数据 sudo -u "$HADOOP_USER" HADOOP_BIN="$hadoop_bin" bash -c ' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_BIN/hdfs" dfs -rm -r -f /test-validation' if [ $validation_passed -eq $validation_total ]; then print_success "集群验证完成,所有测试通过 ($validation_passed/$validation_total)" else print_warning "集群验证完成,部分测试通过 ($validation_passed/$validation_total)" fi print_step_complete } # 验证HTTP服务是否可访问 check_http_service() { local url="$1" local service_name="$2" local timeout="${3:-10}" # 默认10秒超时 print_info "检查 $service_name 服务 ($url)..." 
# 使用curl检查 if curl -s -f --max-time "$timeout" --head "$url" > /dev/null 2>&1; then print_success "$service_name 可访问" return 0 else print_warning "$service_name 不可访问" return 1 fi } # 模块15:访问信息显示 module_access_info() { print_step "访问信息" "显示集群访问信息" # 获取节点IP get_ip() { local node=$1 sudo -u $HADOOP_USER ssh "$HADOOP_USER@$node" "hostname -I | awk '{print \$1}' 2>/dev/null || echo '$node'" } local namenode_ip=$(get_ip "$NAMENODE_NODE") local resourcemanager_ip=$(get_ip "$RESOURCEMANAGER_NODE") local secondary_ip=$(get_ip "$SECONDARY_NODE") local jobhistory_ip=$(get_ip "$JOBHISTORY_NODE") # 显示访问信息 echo -e "\n${GREEN}${BOLD}══════════════════════════════════════════════════════════${NC}" echo -e "${GREEN}${BOLD} Hadoop集群部署完成!${NC}" echo -e "${GREEN}${BOLD}══════════════════════════════════════════════════════════${NC}" echo -e "\n${CYAN}${BOLD}📡 Web UI访问地址:${NC}" echo -e " ${WHITE}NameNode:${NC} ${GREEN}http://${namenode_ip}:9870${NC}" echo -e " ${WHITE}ResourceManager:${NC} ${GREEN}http://${resourcemanager_ip}:8088${NC}" echo -e " ${WHITE}SecondaryNameNode:${NC} ${GREEN}http://${secondary_ip}:9868${NC}" echo -e " ${WHITE}JobHistory Server:${NC} ${GREEN}http://${jobhistory_ip}:19888${NC}" echo -e "\n${CYAN}${BOLD}🔧 服务端点:${NC}" echo -e " ${WHITE}HDFS RPC:${NC} ${GREEN}hdfs://${NAMENODE_NODE}:9820${NC}" echo -e " ${WHITE}DataNode Web UI:${NC}" for node in $ALL_NODES; do local node_ip=$(get_ip "$node") echo -e " - $node: ${GREEN}http://${node_ip}:9864${NC}" done echo -e "\n${CYAN}${BOLD}💻 命令行工具:${NC}" echo -e " ${WHITE}HDFS操作:${NC} ${GREEN}hdfs dfs -ls /${NC}" echo -e " ${WHITE}YARN应用列表:${NC} ${GREEN}yarn application -list${NC}" echo -e " ${WHITE}集群状态:${NC} ${GREEN}hdfs dfsadmin -report${NC}" echo -e "\n${CYAN}${BOLD}📊 集群信息:${NC}" echo -e " ${WHITE}集群名称:${NC} ${YELLOW}$CLUSTER_NAME${NC}" echo -e " ${WHITE}Hadoop版本:${NC} ${YELLOW}$HADOOP_VERSION${NC}" echo -e " ${WHITE}运行用户:${NC} ${YELLOW}$HADOOP_USER${NC}" echo -e " ${WHITE}数据目录:${NC} ${YELLOW}$DATA_DIR${NC}" echo -e 
"\n${YELLOW}${BOLD}📋 下一步建议:${NC}" echo -e " 1. 将访问地址添加到书签" echo -e " 2. 检查防火墙确保端口可访问" echo -e " 3. 运行测试作业验证集群功能" echo -e " 4. 配置监控和告警" echo -e "\n${GREEN}${BOLD}══════════════════════════════════════════════════════════${NC}" # 保存配置到文件 cat > "$CONFIG_FILE" << EOF # Hadoop集群配置备份 - $(date) CLUSTER_NAME="$CLUSTER_NAME" HADOOP_VERSION="$HADOOP_VERSION" HADOOP_USER="$HADOOP_USER" HADOOP_HOME="$HADOOP_HOME" JAVA_HOME="$JAVA_HOME" # 节点配置 MASTER_NODE="$MASTER_NODE" WORKER_NODES="$WORKER_NODES" ALL_NODES="$ALL_NODES" # 服务分配 NAMENODE_NODE="$NAMENODE_NODE" RESOURCEMANAGER_NODE="$RESOURCEMANAGER_NODE" SECONDARY_NODE="$SECONDARY_NODE" JOBHISTORY_NODE="$JOBHISTORY_NODE" # 访问信息 NAMENODE_WEB="http://$namenode_ip:9870" RESOURCEMANAGER_WEB="http://$resourcemanager_ip:8088" HDFS_RPC="hdfs://$NAMENODE_NODE:9820" # 日志文件 SETUP_LOG="$LOG_FILE" EOF print_info "配置已保存到: $CONFIG_FILE" print_step_complete } # 模块:启动集群服务 module_start_cluster() { print_step "启动集群服务" "启动Hadoop集群的所有服务" # 获取当前主机名 local current_host=$(hostname) print_info "当前主机: $current_host" print_info "NameNode节点: $NAMENODE_NODE" print_info "SecondaryNameNode节点: $SECONDARY_NODE" print_info "ResourceManager节点: $RESOURCEMANAGER_NODE" print_info "JobHistory节点: $JOBHISTORY_NODE" # 确保使用完整路径 local hadoop_sbin="$HADOOP_HOME/sbin" local hadoop_bin="$HADOOP_HOME/bin" # 清理旧的PID文件(避免启动失败) print_info "清理旧的PID文件..." for node in $ALL_NODES; do print_info "清理节点: $node" if [ "$node" = "$(hostname)" ]; then # 本地节点 sudo rm -f /tmp/hadoop-*.pid /tmp/hadoop-hadoop-*.pid 2>/dev/null || true else # 远程节点 sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" "sudo rm -f /tmp/hadoop-*.pid /tmp/hadoop-hadoop-*.pid 2>/dev/null" || true fi done # 启动HDFS 102 print_info "启动HDFS服务..." 
if [ -f "$hadoop_sbin/start-dfs.sh" ]; then sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$NAMENODE_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_HOME"/sbin/start-dfs.sh EOF print_success "HDFS启动命令已执行" else print_error "找不到 start-dfs.sh" return 1 fi # 启动YARN 103 print_info "启动YARN服务..." if [ -f "$hadoop_sbin/start-yarn.sh" ]; then sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$RESOURCEMANAGER_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi $HADOOP_HOME/sbin/start-yarn.sh EOF print_success "YARN启动命令已执行" else print_error "找不到 start-yarn.sh" return 1 fi # 启动JobHistory Server print_info "启动JobHistory Server..." if [ -f "$hadoop_bin/mapred" ]; then sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$RESOURCEMANAGER_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi $HADOOP_HOME/bin/mapred --daemon start historyserver EOF print_success "JobHistory Server启动命令已执行" else print_error "找不到 mapred 命令" fi # 等待服务启动 print_info "等待服务启动..." sleep 10 print_step_complete } # 手动启动失败的服务 manual_start_failed_services() { echo -e "\n${YELLOW}${BOLD}🔄 尝试手动启动失败的服务...${NC}" # 确保使用完整路径 local hadoop_sbin="$HADOOP_HOME/sbin" local hadoop_bin="$HADOOP_HOME/bin" # 启动NameNode(如果失败) local nn_pid=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$NAMENODE_NODE" "jps | grep -i namenode | awk '{print \$1}'" 2>/dev/null || echo "") if [ -z "$nn_pid" ]; then print_info "手动启动NameNode (节点: $NAMENODE_NODE)..." sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$NAMENODE_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi echo "停止可能残留的NameNode进程..." pkill -f "NameNode" 2>/dev/null || true sleep 2 echo "清理PID文件..." 
rm -f /tmp/hadoop-*-namenode.pid /tmp/hadoop-hadoop-namenode.pid 2>/dev/null || true echo "启动NameNode..." "$HADOOP_HOMe/bin/hdfs" --daemon start namenode echo "等待5秒..." sleep 5 echo "检查启动结果:" jps | grep -i namenode || echo "NameNode启动失败" EOF 2>/dev/null || print_warning "NameNode启动失败" fi # 启动ResourceManager(如果失败) local rm_pid=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$RESOURCEMANAGER_NODE" "jps | grep -i resourcemanager | awk '{print \$1}'" 2>/dev/null || echo "") if [ -z "$rm_pid" ]; then print_info "手动启动ResourceManager (节点: $RESOURCEMANAGER_NODE)..." sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$RESOURCEMANAGER_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi echo "停止可能残留的ResourceManager进程..." pkill -f "ResourceManager" 2>/dev/null || true sleep 2 echo "清理PID文件..." rm -f /tmp/hadoop-*-resourcemanager.pid /tmp/hadoop-hadoop-resourcemanager.pid 2>/dev/null || true echo "启动ResourceManager..." "$HADOOP_HOME/bin/yarn" --daemon start resourcemanager echo "等待5秒..." sleep 5 echo "检查启动结果:" jps | grep -i resourcemanager || echo "ResourceManager启动失败" EOF 2>/dev/null || print_warning "ResourceManager启动失败" fi # 启动SecondaryNameNode local snn_pid=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$SECONDARY_NODE" "jps | grep -i secondarynamenode | awk '{print \$1}'" 2>/dev/null || echo "") if [ -z "$snn_pid" ]; then print_info "手动启动SecondaryNameNode (节点: $SECONDARY_NODE)..." sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$SECONDARY_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi echo "停止可能残留的SecondaryNameNode进程..." pkill -f "SecondaryNameNode" 2>/dev/null || true sleep 2 echo "清理PID文件..." rm -f /tmp/hadoop-*-secondarynamenode.pid /tmp/hadoop-hadoop-secondarynamenode.pid 2>/dev/null || true echo "启动SecondaryNameNode..." "$HADOOP_HOME/bin/hdfs" --daemon start secondarynamenode echo "等待5秒..." 
sleep 5 echo "检查启动结果:" jps | grep -i secondarynamenode || echo "SecondaryNameNode启动失败" EOF 2>/dev/null || print_warning "SecondaryNameNode启动失败" fi # 启动JobHistory Server(如果失败) local jhs_pid=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$JOBHISTORY_NODE" "jps | grep -i jobhistoryserver | awk '{print \$1}'" 2>/dev/null || echo "") if [ -z "$jhs_pid" ]; then print_info "手动启动JobHistory Server (节点: $JOBHISTORY_NODE)..." sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$JOBHISTORY_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi echo "停止可能残留的JobHistoryServer进程..." pkill -f "JobHistoryServer" 2>/dev/null || true sleep 2 echo "清理PID文件..." rm -f /tmp/hadoop-*-jobhistoryserver.pid /tmp/hadoop-hadoop-jobhistoryserver.pid 2>/dev/null || true echo "启动JobHistoryServer..." "$HADOOP_HOME/bin/mapred" --daemon start historyserver echo "等待5秒..." sleep 5 echo "检查启动结果:" jps | grep -i jobhistoryserver || echo "JobHistoryServer启动失败" EOF 2>/dev/null || print_warning "JobHistoryServer启动失败" fi # 启动缺失的DataNode如果失败) for node in $ALL_NODES; do local dn_pid=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" "jps | grep -i datanode | awk '{print \$1}'" 2>/dev/null || echo "") if [ -z "$dn_pid" ]; then print_info "手动启动DataNode (节点: $node)..." sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi echo "停止可能残留的DataNode进程..." pkill -f "DataNode" 2>/dev/null || true sleep 2 echo "清理PID文件..." rm -f /tmp/hadoop-*-datanode.pid /tmp/hadoop-hadoop-datanode.pid 2>/dev/null || true echo "检查数据目录..." if [ ! -d "$DATA_DIR/hdfs/data" ]; then echo "创建数据目录..." sudo mkdir -p "$DATA_DIR/hdfs/data" sudo chown -R $HADOOP_USER:$HADOOP_GROUP "$DATA_DIR/hdfs/data" sudo chmod -R 755 "$DATA_DIR/hdfs/data" fi echo "启动DataNode..." "$HADOOP_HOME/bin/hdfs" --daemon start datanode echo "等待5秒..." 
sleep 5 echo "检查启动结果:" jps | grep -i datanode || echo "DataNode启动失败" EOF 2>/dev/null || print_warning "$node节点DataNode启动失败" fi done # 等待服务启动 sleep 10 } # 诊断Hadoop配置问题 # 诊断Hadoop配置问题(返回诊断结果) diagnose_hadoop_issues() { local result="" # 检查JAVA_HOME设置 result+="${CYAN}${BOLD}1. 检查各节点JAVA_HOME设置:${NC}\n" for node in $ALL_NODES; do result+="\n${WHITE}节点 $node:${NC}\n" local java_home_check=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" bash -c ' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi echo "JAVA_HOME: "\$JAVA_HOME command -v java 2>/dev/null || echo "Java命令未找到" ' 2>/dev/null) result+="$java_home_check\n" # 检查是否为空或未设置 if echo "$java_home_check" | grep -q "JAVA_HOME=$" || echo "$java_home_check" | grep -q "Java命令未找到"; then result+=" ${RED}✗ Java环境有问题${NC}\n" else result+=" ${GREEN}✓ Java环境正常${NC}\n" fi done # 检查数据目录 result+="\n${CYAN}${BOLD}2. 检查各节点数据目录:${NC}\n" for node in $ALL_NODES; do result+="\n${WHITE}节点 $node:${NC}\n" local data_dir_check=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" " if [ -d '$DATA_DIR/hdfs' ]; then echo '数据目录存在' ls -la '$DATA_DIR/hdfs/' | head -5 else echo '数据目录不存在' fi " 2>/dev/null) result+="$data_dir_check\n" # 检查数据目录是否存在 if echo "$data_dir_check" | grep -q "数据目录不存在"; then result+=" ${RED}✗ 数据目录不存在${NC}\n" else result+=" ${GREEN}✓ 数据目录正常${NC}\n" fi done # 检查配置文件 result+="\n${CYAN}${BOLD}3. 
检查Hadoop配置文件:${NC}\n" local conf_dir="$HADOOP_HOME/etc/hadoop" # core-site.xml local fs_default=$(grep -A1 "fs.defaultFS" "$conf_dir/core-site.xml" 2>/dev/null || echo "配置文件不存在") result+="\n${WHITE}core-site.xml (fs.defaultFS):${NC}\n$fs_default\n" # yarn-site.xml local rm_hostname=$(grep -A1 "yarn.resourcemanager.hostname" "$conf_dir/yarn-site.xml" 2>/dev/null || echo "配置文件不存在") result+="\n${WHITE}yarn-site.xml (yarn.resourcemanager.hostname):${NC}\n$rm_hostname\n" # hdfs-site.xml local secondary_addr=$(grep -A1 "dfs.namenode.secondary.http-address" "$conf_dir/hdfs-site.xml" 2>/dev/null || echo "配置文件不存在") result+="\n${WHITE}hdfs-site.xml (dfs.namenode.secondary.http-address):${NC}\n$secondary_addr\n" echo -e "$result" } # 或者使用更简洁的诊断函数,返回退出码 diagnose_and_validate() { echo -e "\n${CYAN}${BOLD}🔧 Hadoop配置诊断:${NC}" local critical_errors=0 # 检查Java环境 print_info "检查各节点Java环境..." for node in $ALL_NODES; do local java_check=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" " if command -v java &> /dev/null; then if [ -n \"\$JAVA_HOME\" ]; then echo \"✓ Java环境正常 (JAVA_HOME: \$JAVA_HOME)\" exit 0 else echo \"✗ JAVA_HOME未设置\" exit 1 fi else echo \"✗ Java命令未找到\" exit 2 fi " 2>/dev/null) if [ $? -ne 0 ]; then echo -e " ${RED}$node: $java_check${NC}" critical_errors=$((critical_errors + 1)) else echo -e " ${GREEN}$node: $java_check${NC}" fi done # 检查数据目录 print_info "检查各节点数据目录..." for node in $ALL_NODES; do local data_dir_check=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" " if [ -d '$DATA_DIR/hdfs/data' ] && [ -d '$DATA_DIR/hdfs/name' ]; then echo \"✓ 数据目录存在\" exit 0 else echo \"✗ 数据目录不存在\" exit 1 fi " 2>/dev/null) if [ $? -ne 0 ]; then echo -e " ${RED}$node: $data_dir_check${NC}" critical_errors=$((critical_errors + 1)) else echo -e " ${GREEN}$node: $data_dir_check${NC}" fi done # 检查Hadoop配置文件 print_info "检查Hadoop配置文件..." local conf_dir="$HADOOP_HOME/etc/hadoop" if [ ! 
-f "$conf_dir/core-site.xml" ]; then echo -e " ${RED}✗ core-site.xml不存在${NC}" critical_errors=$((critical_errors + 1)) else echo -e " ${GREEN}✓ core-site.xml存在${NC}" fi if [ ! -f "$conf_dir/yarn-site.xml" ]; then echo -e " ${RED}✗ yarn-site.xml不存在${NC}" critical_errors=$((critical_errors + 1)) else echo -e " ${GREEN}✓ yarn-site.xml存在${NC}" fi if [ ! -f "$conf_dir/hdfs-site.xml" ]; then echo -e " ${RED}✗ hdfs-site.xml不存在${NC}" critical_errors=$((critical_errors + 1)) else echo -e " ${GREEN}✓ hdfs-site.xml存在${NC}" fi # 显示诊断结果 echo -e "\n${CYAN}${BOLD}📊 诊断结果:${NC}" if [ $critical_errors -eq 0 ]; then echo -e "${GREEN}${BOLD}✅ 所有检查通过,可以启动集群${NC}" return 0 else echo -e "${RED}${BOLD}❌ 发现 $critical_errors 个严重问题${NC}" echo -e "${YELLOW}请修复上述问题后重新启动集群。${NC}" return 1 fi } # 修复集群问题函数 fix_cluster_issues() { echo -e "\n${BLUE}开始修复集群问题...${NC}" # 修复Java环境 print_info "修复Java环境..." for node in $ALL_NODES; do print_info "修复节点 $node 的Java环境..." sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" " # 检测Java安装路径 if command -v java &> /dev/null; then java_path=\$(which java) if [ -L \"\$java_path\" ]; then java_path=\$(readlink -f \"\$java_path\") fi detected_java_home=\$(dirname \"\$(dirname \"\$java_path\")\") # 设置JAVA_HOME if [ -d \"\$detected_java_home\" ]; then echo \"检测到Java安装路径: \$detected_java_home\" # 添加到.bashrc if ! grep -q \"JAVA_HOME=\" ~/.bashrc; then echo \"export JAVA_HOME=\\\"\$detected_java_home\\\"\" >> ~/.bashrc echo \"export PATH=\\\"\$JAVA_HOME/bin:\\\$PATH\\\"\" >> ~/.bashrc echo \"已添加到.bashrc\" fi # 立即生效 export JAVA_HOME=\"\$detected_java_home\" export PATH=\"\$JAVA_HOME/bin:\$PATH\" echo \"JAVA_HOME设置为: \$JAVA_HOME\" else echo \"无法确定JAVA_HOME\" fi else echo \"Java未安装\" fi " 2>/dev/null || print_warning "节点 $node Java环境修复失败" done # 修复数据目录 print_info "修复数据目录..." for node in $ALL_NODES; do print_info "修复节点 $node 的数据目录..." 
sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" " # 创建数据目录 sudo mkdir -p '$DATA_DIR/hdfs/data' sudo mkdir -p '$DATA_DIR/hdfs/name' sudo mkdir -p '$DATA_DIR/tmp' sudo mkdir -p '$DATA_DIR/yarn/local' sudo mkdir -p '$DATA_DIR/yarn/logs' # 设置权限 sudo chown -R '$HADOOP_USER:$HADOOP_GROUP' '$DATA_DIR' sudo chmod -R 755 '$DATA_DIR' echo \"数据目录创建完成: $DATA_DIR\" " 2>/dev/null || print_warning "节点 $node 数据目录修复失败" done print_success "修复完成,请重新启动集群" } # 检查服务启动失败的具体原因 check_service_logs() { local service="$1" local node="$2" echo -e "\n${YELLOW}${BOLD}🔍 检查 $service 日志 ($node):${NC}" case $service in "NameNode") local log_pattern="namenode" ;; "ResourceManager") local log_pattern="resourcemanager" ;; "DataNode") local log_pattern="datanode" ;; "SecondaryNameNode") local log_pattern="secondarynamenode" ;; "JobHistoryServer") local log_pattern="jobhistory" ;; *) local log_pattern="$service" ;; esac # 查找并显示最新的日志 sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" " echo '查找 $service 日志文件...' find '$LOG_DIR' -name '*$log_pattern*.log' -type f 2>/dev/null | head -3 echo '' echo '最新日志的最后20行:' find '$LOG_DIR' -name '*$log_pattern*.log' -type f 2>/dev/null | head -1 | xargs tail -20 2>/dev/null || echo '未找到日志文件' echo '' echo '检查是否有错误:' find '$LOG_DIR' -name '*$log_pattern*.log' -type f 2>/dev/null | head -1 | xargs grep -i 'error\|exception\|fatal\|failed' 2>/dev/null | head -5 || echo '未找到错误信息' " 2>/dev/null || echo "无法连接到节点 $node" } # 检查服务状态函数 check_service_status() { echo -e "\n${CYAN}${BOLD}🔍 服务状态检查:${NC}" local failed_services=() # 检查NameNode print_info "检查NameNode..." local nn_pid=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$NAMENODE_NODE" "jps | grep -i namenode | awk '{print \$1}'" 2>/dev/null || echo "") if [ -n "$nn_pid" ]; then echo -e " ${GREEN}✓ NameNode${NC} - 运行中 (PID: $nn_pid, 节点: $NAMENODE_NODE)" else echo -e " ${RED}✗ NameNode${NC} - 未运行 (应在节点: $NAMENODE_NODE)" failed_services+=("NameNode@$NAMENODE_NODE") fi # 检查ResourceManager print_info "检查ResourceManager..." 
local rm_pid=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$RESOURCEMANAGER_NODE" "jps | grep -i resourcemanager | awk '{print \$1}'" 2>/dev/null || echo "") if [ -n "$rm_pid" ]; then echo -e " ${GREEN}✓ ResourceManager${NC} - 运行中 (PID: $rm_pid, 节点: $RESOURCEMANAGER_NODE)" else echo -e " ${RED}✗ ResourceManager${NC} - 未运行 (应在节点: $RESOURCEMANAGER_NODE)" failed_services+=("ResourceManager@$RESOURCEMANAGER_NODE") fi # 检查JobHistory Server print_info "检查JobHistory Server..." local jhs_pid=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$JOBHISTORY_NODE" "jps | grep -i jobhistoryserver | awk '{print \$1}'" 2>/dev/null || echo "") if [ -n "$jhs_pid" ]; then echo -e " ${GREEN}✓ JobHistory Server${NC} - 运行中 (PID: $jhs_pid, 节点: $JOBHISTORY_NODE)" else echo -e " ${RED}✗ JobHistory Server${NC} - 未运行 (应在节点: $JOBHISTORY_NODE)" failed_services+=("JobHistoryServer@$JOBHISTORY_NODE") fi # 检查SecondaryNameNode print_info "检查SecondaryNameNode..." local snn_pid=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$SECONDARY_NODE" "jps | grep -i secondarynamenode | awk '{print \$1}'" 2>/dev/null || echo "") if [ -n "$snn_pid" ]; then echo -e " ${GREEN}✓ SecondaryNameNode${NC} - 运行中 (PID: $snn_pid, 节点: $SECONDARY_NODE)" else echo -e " ${RED}✗ SecondaryNameNode${NC} - 未运行 (应在节点: $SECONDARY_NODE)" failed_services+=("SecondaryNameNode@$SECONDARY_NODE") fi # 检查DataNode print_info "检查DataNode..." echo -e "\n${CYAN}${BOLD}📊 DataNode状态:${NC}" for node in $ALL_NODES; do local dn_pid=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" "jps | grep -i datanode | awk '{print \$1}'" 2>/dev/null || echo "") if [ -n "$dn_pid" ]; then echo -e " ${GREEN}✓ $node${NC} - DataNode运行中 (PID: $dn_pid)" else echo -e " ${RED}✗ $node${NC} - DataNode未运行" failed_services+=("DataNode@$node") fi done # 检查NodeManager print_info "检查NodeManager..." 
echo -e "\n${CYAN}${BOLD}📊 NodeManager状态:${NC}" for node in $ALL_NODES; do local nm_pid=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" "jps | grep -i nodemanager | awk '{print \$1}'" 2>/dev/null || echo "") if [ -n "$nm_pid" ]; then echo -e " ${GREEN}✓ $node${NC} - NodeManager运行中 (PID: $nm_pid)" else echo -e " ${RED}✗ $node${NC} - NodeManager未运行" failed_services+=("NodeManager@$node") fi done # 检查失败服务的日志 if [ ${#failed_services[@]} -gt 0 ]; then echo -e "\n${RED}${BOLD}🔴 以下服务启动失败:${NC}" for service_info in "${failed_services[@]}"; do local service=$(echo "$service_info" | cut -d'@' -f1) local node=$(echo "$service_info" | cut -d'@' -f2) echo -e " ${RED}$service (节点: $node)${NC}" check_service_logs "$service" "$node" done fi return ${#failed_services[@]} } # 启动集群主函数 start_hadoop_cluster() { echo -e "\n${BLUE}开始启动Hadoop集群...${NC}" # 检查Hadoop是否安装 if [ ! -d "$HADOOP_HOME" ]; then print_error "Hadoop未安装,请先运行安装集群" return 1 fi echo -e "${BLUE}${BOLD}══════════════════════════════════════════════════════════${NC}" # 重置步骤计数器(重要!) STEP=0 TOTAL_STEPS=6 # 设置停止集群的总步骤数 ( # 步骤1:诊断配置 print_step "检查集群配置" "诊断可能的问题" local diagnose_result=$(diagnose_hadoop_issues) echo "$diagnose_result" # 分析诊断结果 local has_critical_issues=0 local issues_list=() # 检查Java环境 if echo "$diagnose_result" | grep -q "JAVA_HOME=$\|JAVA_HOME=$\|Java命令未找到"; then has_critical_issues=1 issues_list+=("Java环境问题") fi # 检查数据目录 if echo "$diagnose_result" | grep -q "数据目录不存在"; then has_critical_issues=1 issues_list+=("数据目录问题") fi # 如果有严重问题,终止启动 if [ $has_critical_issues -eq 1 ]; then echo -e "\n${RED}${BOLD}❌ 检测到严重问题,集群启动终止!${NC}" echo -e "${RED}发现以下问题:${NC}" for issue in "${issues_list[@]}"; do echo -e " ${RED}• $issue${NC}" done echo -e "\n${YELLOW}${BOLD}🔧 修复建议:${NC}" # Java环境修复建议 if [[ "${issues_list[*]}" =~ "Java环境问题" ]]; then echo -e "1. 
修复Java环境:" echo -e " # 在 hadoop103 和 hadoop104 上执行:" echo -e " echo 'export JAVA_HOME=/usr/lib/jvm/java-11-openjdk' >> ~/.bashrc" echo -e " source ~/.bashrc" echo -e " # 验证:" echo -e " java -version" echo -e " echo \$JAVA_HOME" fi # 数据目录修复建议 if [[ "${issues_list[*]}" =~ "数据目录问题" ]]; then echo -e "\n2. 创建数据目录:" echo -e " # 在 hadoop103 和 hadoop104 上执行:" echo -e " sudo mkdir -p $DATA_DIR/hdfs/data" echo -e " sudo mkdir -p $DATA_DIR/hdfs/name" echo -e " sudo mkdir -p $DATA_DIR/tmp" echo -e " sudo mkdir -p $DATA_DIR/yarn/local" echo -e " sudo mkdir -p $DATA_DIR/yarn/logs" echo -e " sudo chown -R $HADOOP_USER:$HADOOP_GROUP $DATA_DIR" echo -e " sudo chmod -R 755 $DATA_DIR" fi echo -e "\n${YELLOW}修复完成后,请重新运行启动命令。${NC}" print_step_complete return 1 fi print_success "环境诊断通过" print_step_complete # 步骤2:启动集群服务 module_start_cluster # 步骤3:检查服务状态 print_step "检查服务状态" "检查NameNode、ResourceManager、JobHistory Server、SecondaryNameNode、DataNode、NodeManager" if check_service_status; then print_success "所有服务启动成功!" else print_warning "部分服务启动失败" fi print_step_complete # 步骤4:手动修复失败的服务 print_step "修复未启动服务" "尝试手动启动失败的服务" manual_start_failed_services print_step_complete # 步骤5:再次检查服务状态 print_step "最终状态检查" "验证所有服务是否正常启动" if check_service_status; then print_success "所有服务启动成功!" 
else print_warning "仍有部分服务启动失败" fi print_step_complete # 步骤6:测试集群 module_cluster_validate print_step_complete # 步骤7:显示访问信息 echo -e "\n${GREEN}${BOLD}🌐 Web UI访问地址:${NC}" echo -e " NameNode: ${GREEN}http://${NAMENODE_NODE}:9870${NC}" echo -e " ResourceManager: ${GREEN}http://${RESOURCEMANAGER_NODE}:8088${NC}" echo -e " JobHistory: ${GREEN}http://${JOBHISTORY_NODE}:19888${NC}" echo -e " SecondaryNameNode: ${GREEN}http://${SECONDARY_NODE}:9868${NC}" echo -e "\n${GREEN}${BOLD}✅ Hadoop集群启动完成!${NC}" ) 2>&1 | tee -a "$LOG_FILE" local exit_code=${PIPESTATUS[0]} return $exit_code } # 模块:停止集群服务 module_stop_cluster() { #print_step "停止集群服务" "停止Hadoop集群的所有服务" # 确保使用完整路径 local hadoop_sbin="$HADOOP_HOME/sbin" local hadoop_bin="$HADOOP_HOME/bin" # 停止JobHistory Server print_step "停止JobHistory Server" "停止JobHistory Server" if [ -f "$hadoop_bin/mapred" ]; then sudo -u "$HADOOP_USER" HADOOP_BIN="$hadoop_bin" bash -c ' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_BIN/mapred" --daemon stop historyserver' 2>/dev/null || true print_success "JobHistory Server停止命令已执行" else print_warning "找不到 mapred 命令" fi print_step_complete # 停止YARN 103 print_step "停止YARN服务" "停止ResourceManager和NodeManager" if [ -f "$hadoop_sbin/stop-yarn.sh" ]; then sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$RESOURCEMANAGER_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_HOME/sbin/stop-yarn.sh" EOF 2>/dev/null || true print_success "YARN停止命令已执行" else print_warning "找不到 stop-yarn.sh,尝试其他方式停止" # 尝试手动停止 for node in $ALL_NODES; do sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_HOME/bin/yarn" --daemon stop nodemanager EOF 2>/dev/null || true 2>/dev/null || true 
done # 停止ResourceManager sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$RESOURCEMANAGER_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_HOME/bin/yarn" --daemon stop resourcemanager EOF 2>/dev/null || true 2>/dev/null || true fi print_step_complete # 停止HDFS 102 print_step "停止HDFS服务" "停止NameNode、DataNode和SecondaryNameNode" if [ -f "$hadoop_sbin/stop-dfs.sh" ]; then sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$NAMENODE_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_HOME/sbin/stop-dfs.sh" EOF 2>/dev/null || true print_success "HDFS停止命令已执行" else print_warning "找不到 stop-dfs.sh,尝试其他方式停止" # 尝试手动停止 for node in $ALL_NODES; do sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_HOME/bin/hdfs" --daemon stop datanode EOF 2>/dev/null || true 2>/dev/null || true done # 停止NameNode sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$NAMENODE_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_HOME/bin/hdfs" --daemon stop namenode EOF 2>/dev/null || true 2>/dev/null || true # 停止SecondaryNameNode if [ -n "$SECONDARY_NODE" ]; then sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$SECONDARY_NODE" << 'EOF' # 加载hadoop用户的环境 if [ -f ~/.bashrc ]; then source ~/.bashrc fi if [ -f /etc/profile.d/hadoop.sh ]; then source /etc/profile.d/hadoop.sh fi "$HADOOP_HOME/bin/hdfs" --daemon stop secondarynamenode EOF 2>/dev/null || true 2>/dev/null || true fi fi # 等待进程停止 sleep 10 # 检查并停止各节点的服务 print_info "清理残留进程..." 
for node in $ALL_NODES; do print_info "清理节点: $node" timeout 10s "$HADOOP_USER" ssh "$HADOOP_USER@$node" " # 杀死Hadoop相关进程 #pkill -u $HADOOP_USER -f 'NameNode|DataNode|SecondaryNameNode|ResourceManager|NodeManager|JobHistoryServer' 2>/dev/null || true pkill -9 -u $HADOOP_USER -f 'NameNode|DataNode|SecondaryNameNode|ResourceManager|NodeManager|JobHistoryServer' 2>/dev/null || true # 等待 sleep 5 # 强制杀死残留进程 pkill -9 -u $HADOOP_USER -f 'hadoop|yarn|hdfs' 2>/dev/null || true # 清理PID文件 rm -f /tmp/hadoop-*.pid /tmp/hadoop-hadoop-*.pid 2>/dev/null || true rm -f /tmp/*hadoop*.pid /tmp/*yarn*.pid 2>/dev/null || true echo '清理完成' " 2>/dev/null || print_warning "节点 $node 清理时出现警告" done # 等待所有进程停止 sleep 3 # 检查停止结果 check_stop_result print_step_complete } # 检查停止结果 check_stop_result() { echo -e "\n${CYAN}${BOLD}🔍 停止结果检查:${NC}" local running_services=0 # 检查NameNode if pgrep -f "NameNode" > /dev/null; then echo -e " ${RED}✗ NameNode${NC} - 仍然在运行" running_services=$((running_services + 1)) else echo -e " ${GREEN}✓ NameNode${NC} - 已停止" fi # 检查ResourceManager if pgrep -f "ResourceManager" > /dev/null; then echo -e " ${RED}✗ ResourceManager${NC} - 仍然在运行" running_services=$((running_services + 1)) else echo -e " ${GREEN}✓ ResourceManager${NC} - 已停止" fi # 检查JobHistory Server if pgrep -f "JobHistoryServer" > /dev/null; then echo -e " ${RED}✗ JobHistory Server${NC} - 仍然在运行" running_services=$((running_services + 1)) else echo -e " ${GREEN}✓ JobHistory Server${NC} - 已停止" fi # 检查各节点的DataNode和NodeManager for node in $ALL_NODES; do local dn_pid=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" "pgrep -f 'DataNode'" 2>/dev/null || echo "") local nm_pid=$(sudo -u "$HADOOP_USER" ssh "$HADOOP_USER@$node" "pgrep -f 'NodeManager'" 2>/dev/null || echo "") if [ -n "$dn_pid" ]; then echo -e " ${RED}$node DataNode${NC} - 仍然在运行" running_services=$((running_services + 1)) fi if [ -n "$nm_pid" ]; then echo -e " ${RED}$node NodeManager${NC} - 仍然在运行" running_services=$((running_services + 1)) fi done if [ 
$running_services -eq 0 ]; then echo -e "\n${GREEN}${BOLD}✅ 所有Hadoop服务已成功停止!${NC}" else echo -e "\n${YELLOW}${BOLD}⚠️ 仍有 $running_services 个服务在运行${NC}" echo -e "${YELLOW}可以尝试强制停止:${NC}" echo -e " sudo pkill -9 -u $HADOOP_USER -f 'hadoop|yarn|hdfs'" fi } # 停止集群主函数 stop_hadoop_cluster() { echo -e "\n${BLUE}开始停止Hadoop集群...${NC}" # 检查Hadoop是否安装 if [ ! -d "$HADOOP_HOME" ]; then print_error "Hadoop未安装,请先运行安装集群" return 1 fi # 确认停止 echo -e "${YELLOW}${BOLD}确认要停止Hadoop集群吗?(y/n): ${NC}\c" read -r confirm_stop if [[ ! "$confirm_stop" =~ ^[Yy]$ ]]; then print_info "停止操作已取消" return 0 fi echo -e "${BLUE}${BOLD}══════════════════════════════════════════════════════════${NC}" # 重置步骤计数器(重要!) STEP=0 TOTAL_STEPS=3 # 设置停止集群的总步骤数 ( # 停止集群服务 module_stop_cluster ) 2>&1 | tee -a "$LOG_FILE" } # ==================== 主执行流程 ==================== # 修改主函数 main() { # 显示主菜单 show_main_menu case $MODE in "install") # 原有的安装流程 echo -e "\n${GREEN}检测到以下配置:${NC}" echo -e " 集群节点: ${CYAN}$ALL_NODES${NC}" echo -e " Hadoop版本: ${CYAN}$HADOOP_VERSION${NC}" echo -e " 运行用户: ${CYAN}$HADOOP_USER${NC}" echo -e " 安装目录: ${CYAN}$HADOOP_HOME${NC}" echo -e "\n${GREEN}温馨提示:${CYAN}如果Hadoop安装包下载过慢,可以离线下载后,上传到/tmp目录下${NC}" echo -e "\n${YELLOW}是否继续安装?(y/n): ${NC}\c" read -r confirm_install if [[ ! 
"$confirm_install" =~ ^[Yy]$ ]]; then echo -e "${RED}安装已取消。${NC}" exit 0 fi # 创建日志目录 mkdir -p "$(dirname "$LOG_FILE")" touch "$LOG_FILE" echo -e "\n${BLUE}开始执行Hadoop集群安装...${NC}" echo -e "${DIM}详细日志将保存到: $LOG_FILE${NC}" echo -e "${BLUE}${BOLD}══════════════════════════════════════════════════════════${NC}" # 记录开始时间 START_TIME=$(date +%s) # 执行安装流程(原有的安装代码) ( # 验证配置 validate_config # 执行各个模块 safe_execute "系统准备" module_system_prepare safe_execute "用户设置" module_user_setup safe_execute "SSH配置" module_ssh_setup safe_execute "Java安装" module_java_install safe_execute "目录设置" module_directory_setup safe_execute "Hadoop安装" module_hadoop_install safe_execute "配置模板" module_config_templates safe_execute "配置渲染" module_config_render safe_execute "配置分发" module_config_distribute safe_execute "环境设置" module_environment_setup safe_execute "HDFS初始化" module_hdfs_init safe_execute "防火墙配置" module_firewall_setup safe_execute "集群启动" module_cluster_start safe_execute "集群验证" module_cluster_validate safe_execute "访问信息" module_access_info # 显示完成信息 echo -e "\n${GREEN}${BOLD}✨ Hadoop集群安装完成!${NC}" echo -e "${GREEN}请查看上面的访问信息使用集群。${NC}" ) 2>&1 | tee -a "$LOG_FILE" || { echo "警告:日志记录可能不完整,但安装过程继续..." >&2 } # 检查执行状态 if [ ${PIPESTATUS[0]} -ne 0 ]; then echo -e "\n${RED}${BOLD}安装过程中出现错误!${NC}" echo -e "${RED}请检查日志文件: $LOG_FILE${NC}" exit 1 fi # 显示日志文件位置 echo -e "\n${BLUE}详细安装日志: $LOG_FILE${NC}" echo -e "${BLUE}集群配置文件: $CONFIG_FILE${NC}" echo -e "\n${GREEN}${BOLD}🎉 安装完成!${NC}" ;; "uninstall") # 执行卸载流程 uninstall_hadoop_cluster ;; "status") # 检查集群状态 check_cluster_status ;; "start") # 启动集群 start_hadoop_cluster ;; "stop") # 停止集群 stop_hadoop_cluster ;; "fix") # 修复集群问题 fix_cluster_issues ;; esac # 显示日志文件位置 if [ -f "$LOG_FILE" ]; then echo -e "\n${BLUE}详细操作日志: $LOG_FILE${NC}" fi # 询问是否返回主菜单 echo -e "\n${YELLOW}是否返回主菜单?(y/n): ${NC}\c" read -r return_to_menu if [[ "$return_to_menu" =~ ^[Yy]$ ]]; then main else echo -e "${GREEN}退出脚本。${NC}" exit 0 fi } # ==================== 执行主函数 ==================== main

本文作者:widdo

本文链接:

版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!