Installing Apache Spark via Puppet

Puppet Setup

Install the Java module into Puppet:

cd /etc/puppetlabs/code/environments/production
sudo /opt/puppetlabs/bin/puppet module install puppetlabs/java

Install the following Spark manifest to /etc/puppetlabs/code/environments/production/manifests/spark.pp. Note that it hard-codes server names, which is not ideal, but it's a starting point.
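
The manifest below uses hadoop and spark classes that puppetlabs/java does not provide; the class and parameter names appear to match the cesnet Hadoop and Spark modules from the Forge, so presumably those need to be installed the same way, for example:

# Assumption: the hadoop and spark classes come from the cesnet Forge modules
sudo /opt/puppetlabs/bin/puppet module install cesnet-hadoop
sudo /opt/puppetlabs/bin/puppet module install cesnet-spark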

$master_hostname='spark-master.bpopp.net'

class{'hadoop':
  realm         => '',
  hdfs_hostname => $master_hostname,
  slaves        => ['spark1.bpopp.net', 'spark2.bpopp.net'],
}

class{'spark':
  master_hostname        => $master_hostname,
  hdfs_hostname          => $master_hostname,
  historyserver_hostname => $master_hostname,
  yarn_enable            => false,
}

node 'spark-master.bpopp.net' {
  include spark::master
  include spark::historyserver
  include hadoop::namenode
  include spark::hdfs
}

node /spark(1|2).bpopp.net/ {
  include spark::worker
  include hadoop::datanode
}

node 'client.bpopp.net' {
  include hadoop::frontend
  include spark::frontend
}
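
If the nodes are already enrolled as agents against this Puppet server, the manifest can be exercised right away with a manual run on each node; otherwise they will pick it up on the normal agent interval.

# Trigger an immediate catalog run on a node
sudo /opt/puppetlabs/bin/puppet agent -t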

Removing An Existing Connection from DHCP

Update Hosts

Debian adds a loopback host line (typically 127.0.1.1) that keeps the hostname from resolving to the node's real address, so it needs to be removed. Remove the spark1.lab.bpopp.net entry from /etc/hosts.


Removing an Existing DHCP Connection

Ran into a situation where an existing connection was not correctly removed from the DHCP server, which produced:

2020-01-22T19:18:14 048c816a [E] Failed to add DHCP reservation for spark1.lab.bpopp.net (192.168.2.157 / 00:50:56:a9:85:94): Entry already exists

Corrected this issue by removing the stale host object with omshell:

omshell
connect
new host
set name="spark1.lab.bpopp.net"
open 
remove

Removing DNS Entries from pfSense

Also ran into issues with pfSense caching DNS entries and Foreman not being able to overwrite them.

1) Use the Status > DHCP Leases page to remove the existing entry.

2) Flush the cached entry from the pfSense command line:

unbound-control -c /var/unbound/unbound.conf flush spark4.lab.bpopp.net

And then restart the unbound service from Status > Services > unbound.
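
After the restart, it's worth confirming from another lab machine that the name now resolves to the current lease:

# Should return the new lease address rather than the stale cached record
host spark4.lab.bpopp.net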

Manual Configuration

SSH Setup

The master must be able to SSH to the slaves without a password. To do this, generate an SSH key pair on the master and copy the public key to each slave. From the master:

ssh spark@localhost
ssh-keygen -t rsa
ssh-copy-id spark@spark1.lab.bpopp.net
ssh-copy-id spark@spark2.lab.bpopp.net
ssh-copy-id spark@spark3.lab.bpopp.net

Make sure it worked by trying:

ssh spark@localhost

You shouldn't be prompted for a password.
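
To confirm passwordless access to every worker in one pass, a short loop with BatchMode (which fails instead of prompting for a password) works well:

# Each line should print the worker's hostname with no password prompt
for h in spark1 spark2 spark3; do
  ssh -o BatchMode=yes spark@$h.lab.bpopp.net hostname
done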

Spark Config

/usr/local/spark/conf/slaves

# A Spark Worker will be started on each of the machines listed below.
spark1
spark2
spark3
#spark4

/usr/local/spark/conf/spark-env.sh

# Either form works; the first makes the Hadoop config directory explicit.
export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop --config /usr/local/hadoop/etc/hadoop classpath)
#export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)

/usr/local/spark/conf/spark-defaults.conf

# Example:
spark.master                     spark://spark1.lab.bpopp.net:7077
spark.executor.memory           4500m
spark.driver.memory             3g
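
A quick way to confirm that the classpath command used in spark-env.sh actually resolves is to run it by hand; it should print a long list of Hadoop directories and jars:

/usr/local/hadoop/bin/hadoop --config /usr/local/hadoop/etc/hadoop classpath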

Create a Spark Service

/lib/systemd/system/spark.service

[Unit]
Description=Apache Spark Service
[Service]
User=spark
Group=spark
Type=forking
ExecStart=/usr/local/spark/sbin/start-all.sh
ExecStop=/usr/local/spark/sbin/stop-all.sh
WorkingDirectory=/home/spark
Environment=JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64/
Environment=SPARK_HOME=/usr/local/spark
TimeoutStartSec=2min
Restart=on-failure
# Spark writes its PID files to /tmp by default
PIDFile=/tmp/spark-spark-org.apache.spark.deploy.master.Master-1.pid


[Install]
WantedBy=multi-user.target
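
After adding or changing a unit file, systemd has to re-read it before the service can be started. The same applies to the hadoop and jupyter units created below:

sudo systemctl daemon-reload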

Hadoop Config

/usr/local/hadoop/etc/hadoop/hdfs-site.xml


<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>

<property>
    <name>dfs.namenode.name.dir</name>
    <value>file:/home/spark/hdfs/namenode</value>
</property>

<property>
    <name>dfs.datanode.data.dir</name>
    <value>file:/home/spark/hdfs/dfs</value>
</property>
</configuration>

/usr/local/hadoop/etc/hadoop/core-site.xml

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://0.0.0.0:9000</value>
    </property>
   <property>
        <name>hadoop.http.staticuser.user</name>
        <value>spark</value>
   </property>

</configuration>
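
On a fresh install, HDFS typically won't start until the namenode metadata directory has been formatted. This is a one-time step, run as the spark user, and it erases anything already stored under /home/spark/hdfs/namenode:

ssh spark@localhost
# One-time initialization of the namenode metadata directory
/usr/local/hadoop/bin/hdfs namenode -format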

Create a Hadoop Service

/lib/systemd/system/hadoop.service

[Unit]
Description=Hadoop DFS namenode and datanode
[Service]
User=spark
Group=spark
Type=forking
ExecStart=/usr/local/hadoop/sbin/start-all.sh
ExecStop=/usr/local/hadoop/sbin/stop-all.sh
WorkingDirectory=/home/spark
Environment=JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64/
Environment=HADOOP_HOME=/usr/local/hadoop
TimeoutStartSec=2min
Restart=on-failure
PIDFile=/tmp/hadoop-spark-namenode.pid


[Install]
WantedBy=multi-user.target
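
Once the Hadoop service is running (started in the section below), two quick checks help confirm HDFS came up: jps should list the NameNode and DataNode processes, and the workers should be registered with the namenode:

jps
/usr/local/hadoop/bin/hdfs dfsadmin -report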

Jupyter Config

Install

sudo python3 -m pip install jupyter 
ssh spark@localhost
jupyter notebook --generate-config
jupyter notebook password
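
A quick check, still as the spark user, that the Jupyter install landed on the PATH:

jupyter --version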

/home/spark/.jupyter/jupyter_notebook_config.py

## The IP address the notebook server will listen on.
c.NotebookApp.ip = '0.0.0.0'  #default= localhost


Create Jupyter Service

/home/spark/.jupyter/env

PYSPARK_PYTHON=/usr/bin/python3
HADOOP_HOME=/usr/local/hadoop
SPARK_DIST_CLASSPATH=/usr/local/hadoop/etc/hadoop:/usr/local/hadoop/share/hadoop/common/lib/*:/usr/local/hadoop/share/hadoop/common/*:/usr/local/hadoop/share/hadoop/hdfs:/usr/local/hadoop/share/hadoop/hdfs/lib/*:/usr/local/hadoop/share/hadoop/hdfs/*:/usr/local/hadoop/share/hadoop/mapreduce/lib/*:/usr/local/hadoop/share/hadoop/mapreduce/*:/usr/local/hadoop/share/hadoop/yarn:/usr/local/hadoop/share/hadoop/yarn/lib/*:/usr/local/hadoop/share/hadoop/yarn/*
SPARK_HOME=/usr/local/spark
JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64/
PYSPARK_SUBMIT_ARGS=--master spark://spark1.lab.bpopp.net:7077 pyspark-shell
PYTHONPATH=/usr/local/spark/python:/usr/local/spark/python/lib/py4j-0.10.7-src.zip

Exit back to a sudo user

exit

/lib/systemd/system/jupyter.service

[Unit]
Description=Jupyter Notebook Server

[Service]
Type=simple
PIDFile=/run/jupyter.pid

EnvironmentFile=/home/spark/.jupyter/env

# Jupyter Notebook: change PATHs as needed for your system
ExecStart=/usr/local/bin/jupyter notebook

User=spark
Group=spark
WorkingDirectory=/home/spark/work
Restart=always
RestartSec=10
#KillMode=mixed

[Install]
WantedBy=multi-user.target

Start the Services

ssh spark@localhost
/usr/local/spark/sbin/start-all.sh
/usr/local/hadoop/sbin/start-all.sh

Or better yet, use the services:

sudo service spark start
sudo service hadoop start
sudo service jupyter start

Finally, schedule the services to start on boot:

sudo systemctl enable spark
sudo systemctl enable hadoop
sudo systemctl enable jupyter
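
With everything enabled and running, a couple of end-to-end checks are worth doing. The SparkPi example that ships with Spark exercises the master and workers (the exact examples jar name depends on the installed Spark version), and the web UIs should answer on their default ports (8080 for the Spark master, 8888 for Jupyter) if those weren't changed:

# Run the bundled SparkPi example against the standalone master
/usr/local/spark/bin/spark-submit \
  --master spark://spark1.lab.bpopp.net:7077 \
  --class org.apache.spark.examples.SparkPi \
  /usr/local/spark/examples/jars/spark-examples_*.jar 100

# Default web UI ports, assuming they haven't been changed
curl -sI http://spark1.lab.bpopp.net:8080 | head -n 1
curl -sI http://spark1.lab.bpopp.net:8888 | head -n 1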

Install Some Packages

Install pyspark and a few supporting packages:

sudo pip3 install pyspark
sudo pip3 install plotly
sudo pip3 install pandas
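
A quick import check confirms the packages are visible to the Python interpreter that PySpark and Jupyter use:

python3 -c "import pyspark, plotly, pandas; print(pyspark.__version__)"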