nixos/hadoop: fix yarn, add more service configuration options

Branch: main
Authored by illustris 3 years ago, committed by Raphael Megzari
parent ee1fd49ebe
commit 91bb2b7016

Changed files:
  1. nixos/modules/services/cluster/hadoop/conf.nix (34 changed lines)
  2. nixos/modules/services/cluster/hadoop/default.nix (80 changed lines)
  3. nixos/modules/services/cluster/hadoop/hdfs.nix (81 changed lines)
  4. nixos/modules/services/cluster/hadoop/yarn.nix (107 changed lines)

nixos/modules/services/cluster/hadoop/conf.nix
@@ -1,4 +1,4 @@
{ hadoop, pkgs }:
{ cfg, pkgs, lib }:
let
propertyXml = name: value: ''
<property>
@@ -13,19 +13,31 @@ let
${builtins.concatStringsSep "\n" (pkgs.lib.mapAttrsToList propertyXml properties)}
</configuration>
'';
cfgLine = name: value: ''
${name}=${builtins.toString value}
'';
cfgFile = fileName: properties: pkgs.writeTextDir fileName ''
# generated by NixOS
${builtins.concatStringsSep "" (pkgs.lib.mapAttrsToList cfgLine properties)}
'';
userFunctions = ''
hadoop_verify_logdir() {
echo Skipping verification of log directory
}
'';
hadoopEnv = ''
export HADOOP_LOG_DIR=/tmp/hadoop/$USER
'';
in
pkgs.buildEnv {
name = "hadoop-conf";
paths = [
(siteXml "core-site.xml" hadoop.coreSite)
(siteXml "hdfs-site.xml" hadoop.hdfsSite)
(siteXml "mapred-site.xml" hadoop.mapredSite)
(siteXml "yarn-site.xml" hadoop.yarnSite)
(pkgs.writeTextDir "hadoop-user-functions.sh" userFunctions)
];
}
pkgs.runCommand "hadoop-conf" {} ''
mkdir -p $out/
cp ${siteXml "core-site.xml" cfg.coreSite}/* $out/
cp ${siteXml "hdfs-site.xml" cfg.hdfsSite}/* $out/
cp ${siteXml "mapred-site.xml" cfg.mapredSite}/* $out/
cp ${siteXml "yarn-site.xml" cfg.yarnSite}/* $out/
cp ${cfgFile "container-executor.cfg" cfg.containerExecutorCfg}/* $out/
cp ${pkgs.writeTextDir "hadoop-user-functions.sh" userFunctions}/* $out/
cp ${pkgs.writeTextDir "hadoop-env.sh" hadoopEnv}/* $out/
cp ${cfg.log4jProperties} $out/log4j.properties
${lib.concatMapStringsSep "\n" (dir: "cp -r ${dir}/* $out/") cfg.extraConfDirs}
''
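
For context, a minimal standalone sketch of what the new cfgFile helper produces, assuming only nixpkgs in scope (this expression is an illustration, not part of the commit); the attribute names are borrowed from the containerExecutorCfg defaults further down:

let
  pkgs = import <nixpkgs> { };
  # Same helpers as in conf.nix above, reproduced so the expression is self-contained
  cfgLine = name: value: ''
    ${name}=${builtins.toString value}
  '';
  cfgFile = fileName: properties: pkgs.writeTextDir fileName ''
    # generated by NixOS
    ${builtins.concatStringsSep "" (pkgs.lib.mapAttrsToList cfgLine properties)}
  '';
in
# Builds a store directory whose container-executor.cfg contains
# key=value lines such as "min.user.id=1000".
cfgFile "container-executor.cfg" {
  "min.user.id" = 1000;
  "feature.terminal.enabled" = 1;
}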

nixos/modules/services/cluster/hadoop/default.nix
@@ -1,5 +1,7 @@
{ config, lib, pkgs, ...}:
let
cfg = config.services.hadoop;
in
with lib;
{
imports = [ ./yarn.nix ./hdfs.nix ];
@@ -17,7 +19,9 @@ with lib;
};
hdfsSite = mkOption {
default = {};
default = {
"dfs.namenode.rpc-bind-host" = "0.0.0.0";
};
type = types.attrsOf types.anything;
example = literalExpression ''
{
@@ -28,27 +32,81 @@ with lib;
};
mapredSite = mkOption {
default = {};
default = {
"mapreduce.framework.name" = "yarn";
"yarn.app.mapreduce.am.env" = "HADOOP_MAPRED_HOME=${cfg.package}/lib/${cfg.package.untarDir}";
"mapreduce.map.env" = "HADOOP_MAPRED_HOME=${cfg.package}/lib/${cfg.package.untarDir}";
"mapreduce.reduce.env" = "HADOOP_MAPRED_HOME=${cfg.package}/lib/${cfg.package.untarDir}";
};
type = types.attrsOf types.anything;
example = literalExpression ''
{
"mapreduce.map.cpu.vcores" = "1";
options.services.hadoop.mapredSite.default // {
"mapreduce.map.java.opts" = "-Xmx900m -XX:+UseParallelGC";
}
'';
description = "Hadoop mapred-site.xml definition";
};
yarnSite = mkOption {
default = {};
default = {
"yarn.nodemanager.admin-env" = "PATH=$PATH";
"yarn.nodemanager.aux-services" = "mapreduce_shuffle";
"yarn.nodemanager.aux-services.mapreduce_shuffle.class" = "org.apache.hadoop.mapred.ShuffleHandler";
"yarn.nodemanager.bind-host" = "0.0.0.0";
"yarn.nodemanager.container-executor.class" = "org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor";
"yarn.nodemanager.env-whitelist" = "JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,LANG,TZ";
"yarn.nodemanager.linux-container-executor.group" = "hadoop";
"yarn.nodemanager.linux-container-executor.path" = "/run/wrappers/yarn-nodemanager/bin/container-executor";
"yarn.nodemanager.log-dirs" = "/var/log/hadoop/yarn/nodemanager";
"yarn.resourcemanager.bind-host" = "0.0.0.0";
"yarn.resourcemanager.scheduler.class" = "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler";
};
type = types.attrsOf types.anything;
example = literalExpression ''
{
"yarn.resourcemanager.ha.id" = "resourcemanager1";
options.services.hadoop.yarnSite.default // {
"yarn.resourcemanager.hostname" = "''${config.networking.hostName}";
}
'';
description = "Hadoop yarn-site.xml definition";
};
log4jProperties = mkOption {
default = "${cfg.package}/lib/${cfg.package.untarDir}/etc/hadoop/log4j.properties";
type = types.path;
example = literalExpression ''
"''${pkgs.hadoop}/lib/''${pkgs.hadoop.untarDir}/etc/hadoop/log4j.properties";
'';
description = "log4j.properties file added to HADOOP_CONF_DIR";
};
containerExecutorCfg = mkOption {
default = {
# must be the same as yarn.nodemanager.linux-container-executor.group in yarnSite
"yarn.nodemanager.linux-container-executor.group"="hadoop";
"min.user.id"=1000;
"feature.terminal.enabled"=1;
};
type = types.attrsOf types.anything;
example = literalExpression ''
options.services.hadoop.containerExecutorCfg.default // {
"feature.terminal.enabled" = 0;
}
'';
description = "Yarn container-executor.cfg definition";
};
extraConfDirs = mkOption {
default = [];
type = types.listOf types.path;
example = literalExpression ''
[
./extraHDFSConfs
./extraYARNConfs
]
'';
description = "Directories containing additional config files to be added to HADOOP_CONF_DIR";
};
package = mkOption {
type = types.package;
default = pkgs.hadoop;
@@ -64,6 +122,12 @@ with lib;
users.groups.hadoop = {
gid = config.ids.gids.hadoop;
};
environment = {
systemPackages = [ cfg.package ];
etc."hadoop-conf".source = let
hadoopConf = "${import ./conf.nix { inherit cfg pkgs lib; }}/";
in "${hadoopConf}";
};
})
];
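
As a usage sketch (placeholder values, not taken from this commit), the new options could be set from a NixOS configuration roughly as follows; note that, as in the option examples above, merging with options.services.hadoop.<name>.default keeps the module defaults when overriding individual keys:

{ options, ... }:
{
  services.hadoop = {
    coreSite = {
      "fs.defaultFS" = "hdfs://master:8020";  # placeholder namenode address
    };
    # Keep the module defaults and override a single property
    mapredSite = options.services.hadoop.mapredSite.default // {
      "mapreduce.map.java.opts" = "-Xmx900m -XX:+UseParallelGC";
    };
    containerExecutorCfg = options.services.hadoop.containerExecutorCfg.default // {
      "feature.terminal.enabled" = 0;
    };
    extraConfDirs = [ ./extraYARNConfs ];  # hypothetical local directory
  };
}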

nixos/modules/services/cluster/hadoop/hdfs.nix
@@ -1,24 +1,54 @@
{ config, lib, pkgs, ...}:
with lib;
let
cfg = config.services.hadoop;
hadoopConf = import ./conf.nix { hadoop = cfg; pkgs = pkgs; };
hadoopConf = "${import ./conf.nix { inherit cfg pkgs lib; }}/";
restartIfChanged = mkOption {
type = types.bool;
description = ''
Automatically restart the service on config change.
This can be set to false to defer restarts on clusters running critical applications.
Please consider the security implications of inadvertently running an older version,
and the possibility of unexpected behavior caused by inconsistent versions across a cluster when disabling this option.
'';
default = false;
};
in
with lib;
{
options.services.hadoop.hdfs = {
namenode.enabled = mkOption {
type = types.bool;
default = false;
description = ''
Whether to run the Hadoop YARN NameNode
'';
namenode = {
enabled = mkOption {
type = types.bool;
default = false;
description = ''
Whether to run the HDFS NameNode
'';
};
inherit restartIfChanged;
openFirewall = mkOption {
type = types.bool;
default = true;
description = ''
Open firewall ports for namenode
'';
};
};
datanode.enabled = mkOption {
type = types.bool;
default = false;
description = ''
Whether to run the Hadoop YARN DataNode
'';
datanode = {
enabled = mkOption {
type = types.bool;
default = false;
description = ''
Whether to run the HDFS DataNode
'';
};
inherit restartIfChanged;
openFirewall = mkOption {
type = types.bool;
default = true;
description = ''
Open firewall ports for datanode
'';
};
};
};
@@ -27,10 +57,7 @@ with lib;
systemd.services.hdfs-namenode = {
description = "Hadoop HDFS NameNode";
wantedBy = [ "multi-user.target" ];
environment = {
HADOOP_HOME = "${cfg.package}";
};
inherit (cfg.hdfs.namenode) restartIfChanged;
preStart = ''
${cfg.package}/bin/hdfs --config ${hadoopConf} namenode -format -nonInteractive || true
@@ -40,24 +67,34 @@ with lib;
User = "hdfs";
SyslogIdentifier = "hdfs-namenode";
ExecStart = "${cfg.package}/bin/hdfs --config ${hadoopConf} namenode";
Restart = "always";
};
};
networking.firewall.allowedTCPPorts = (mkIf cfg.hdfs.namenode.openFirewall [
9870 # namenode.http-address
8020 # namenode.rpc-address
]);
})
(mkIf cfg.hdfs.datanode.enabled {
systemd.services.hdfs-datanode = {
description = "Hadoop HDFS DataNode";
wantedBy = [ "multi-user.target" ];
environment = {
HADOOP_HOME = "${cfg.package}";
};
inherit (cfg.hdfs.datanode) restartIfChanged;
serviceConfig = {
User = "hdfs";
SyslogIdentifier = "hdfs-datanode";
ExecStart = "${cfg.package}/bin/hdfs --config ${hadoopConf} datanode";
Restart = "always";
};
};
networking.firewall.allowedTCPPorts = (mkIf cfg.hdfs.datanode.openFirewall [
9864 # datanode.http.address
9866 # datanode.address
9867 # datanode.ipc.address
]);
})
(mkIf (
cfg.hdfs.namenode.enabled || cfg.hdfs.datanode.enabled
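
A short sketch of enabling the reworked HDFS options on a node (illustrative values only, not from the commit):

{
  services.hadoop.hdfs = {
    namenode = {
      enabled = true;
      openFirewall = true;       # opens 9870 (http) and 8020 (rpc), as listed above
      restartIfChanged = false;  # defer restarts on a live cluster
    };
    # a worker machine would instead set:
    # datanode.enabled = true;
  };
}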

nixos/modules/services/cluster/hadoop/yarn.nix
@@ -1,24 +1,62 @@
{ config, lib, pkgs, ...}:
with lib;
let
cfg = config.services.hadoop;
hadoopConf = import ./conf.nix { hadoop = cfg; pkgs = pkgs; };
hadoopConf = "${import ./conf.nix { inherit cfg pkgs lib; }}/";
restartIfChanged = mkOption {
type = types.bool;
description = ''
Automatically restart the service on config change.
This can be set to false to defer restarts on clusters running critical applications.
Please consider the security implications of inadvertently running an older version,
and the possibility of unexpected behavior caused by inconsistent versions across a cluster when disabling this option.
'';
default = false;
};
in
with lib;
{
options.services.hadoop.yarn = {
resourcemanager.enabled = mkOption {
type = types.bool;
default = false;
description = ''
Whether to run the Hadoop YARN ResourceManager
'';
resourcemanager = {
enabled = mkOption {
type = types.bool;
default = false;
description = ''
Whether to run the Hadoop YARN ResourceManager
'';
};
inherit restartIfChanged;
openFirewall = mkOption {
type = types.bool;
default = true;
description = ''
Open firewall ports for resourcemanager
'';
};
};
nodemanager.enabled = mkOption {
type = types.bool;
default = false;
description = ''
Whether to run the Hadoop YARN NodeManager
'';
nodemanager = {
enabled = mkOption {
type = types.bool;
default = false;
description = ''
Whether to run the Hadoop YARN NodeManager
'';
};
inherit restartIfChanged;
addBinBash = mkOption {
type = types.bool;
default = true;
description = ''
Add /bin/bash. This is needed by the linux container executor's launch script.
'';
};
openFirewall = mkOption {
type = types.bool;
default = true;
description = ''
Open firewall ports for nodemanager.
Because containers can listen on any ephemeral port, TCP ports 1024-65535 will be opened.
'';
};
};
};
@@ -38,36 +76,63 @@ with lib;
systemd.services.yarn-resourcemanager = {
description = "Hadoop YARN ResourceManager";
wantedBy = [ "multi-user.target" ];
environment = {
HADOOP_HOME = "${cfg.package}";
};
inherit (cfg.yarn.resourcemanager) restartIfChanged;
serviceConfig = {
User = "yarn";
SyslogIdentifier = "yarn-resourcemanager";
ExecStart = "${cfg.package}/bin/yarn --config ${hadoopConf} " +
" resourcemanager";
Restart = "always";
};
};
networking.firewall.allowedTCPPorts = (mkIf cfg.yarn.resourcemanager.openFirewall [
8088 # resourcemanager.webapp.address
8030 # resourcemanager.scheduler.address
8031 # resourcemanager.resource-tracker.address
8032 # resourcemanager.address
]);
})
(mkIf cfg.yarn.nodemanager.enabled {
# Needed because yarn hardcodes /bin/bash in container start scripts
# These scripts can't be patched, they are generated at runtime
systemd.tmpfiles.rules = [
(mkIf cfg.yarn.nodemanager.addBinBash "L /bin/bash - - - - /run/current-system/sw/bin/bash")
];
systemd.services.yarn-nodemanager = {
description = "Hadoop YARN NodeManager";
wantedBy = [ "multi-user.target" ];
inherit (cfg.yarn.nodemanager) restartIfChanged;
environment = {
HADOOP_HOME = "${cfg.package}";
};
preStart = ''
# create log dir
mkdir -p /var/log/hadoop/yarn/nodemanager
chown yarn:hadoop /var/log/hadoop/yarn/nodemanager
# set up setuid container executor binary
rm -rf /run/wrappers/yarn-nodemanager/ || true
mkdir -p /run/wrappers/yarn-nodemanager/{bin,etc/hadoop}
cp ${cfg.package}/lib/${cfg.package.untarDir}/bin/container-executor /run/wrappers/yarn-nodemanager/bin/
chgrp hadoop /run/wrappers/yarn-nodemanager/bin/container-executor
chmod 6050 /run/wrappers/yarn-nodemanager/bin/container-executor
cp ${hadoopConf}/container-executor.cfg /run/wrappers/yarn-nodemanager/etc/hadoop/
'';
serviceConfig = {
User = "yarn";
SyslogIdentifier = "yarn-nodemanager";
PermissionsStartOnly = true;
ExecStart = "${cfg.package}/bin/yarn --config ${hadoopConf} " +
" nodemanager";
Restart = "always";
};
};
networking.firewall.allowedTCPPortRanges = [
(mkIf (cfg.yarn.nodemanager.openFirewall) {from = 1024; to = 65535;})
];
})
];
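
And a matching sketch for the YARN side (illustrative values only):

{
  services.hadoop.yarn = {
    resourcemanager = {
      enabled = true;
      openFirewall = true;       # 8088, 8030, 8031, 8032
    };
    nodemanager = {
      enabled = true;
      addBinBash = true;         # provides /bin/bash for the container launch scripts
      openFirewall = true;       # opens TCP 1024-65535 for container ports
      restartIfChanged = false;
    };
  };
}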
