Commit 10c27789 authored by Ryan Taylor's avatar Ryan Taylor

initial commit

parent bc1199a5
terraform/.terraform/
terraform/terraform.tfstate
terraform/terraform.tfstate.backup
---
# Play for the management node: applies the folding-controller role so the
# host can run FAHControl and monitor the clients.
- hosts: controller
  become: true
  become_user: root
  roles:
    - folding-controller
folding-client
==============
Set up a Folding@home client (FAHClient).
List of active projects: https://apps.foldingathome.org/psummary
---
# Role defaults for the Folding@home client.

# Written to the <user> element of the FAHClient config.
fah_user: 'anonymous'

# Written to the <team> element of the FAHClient config.
# 250396 is the Compute Canada team: https://stats.foldingathome.org/team/250396
fah_team: '250396'

# Optional passkey, for when you don't want to be anonymous.
# Create one at https://apps.foldingathome.org/passkey/create
#fah_passkey:

# Password that secures FAHControl access to the client nodes.
#fah_adminpass:

# IP address of the FAHControl node.
# FAHClients will allow remote access on port 36330 from this address.
fah_manager: ''

# Kept as a string; the task file converts it with `| bool` before using it
# to decide whether the update tasks run.
apply_updates: 'false'

# Passed to the timezone module.
timezone: 'America/Vancouver'
---
# Install and configure the Folding@home client (FAHClient).
# Download location: https://foldingathome.org/start-folding/
#
# The install task reads ansible_facts.packages, so package facts must have
# been gathered before these tasks run.

- name: Install FAHClient
  yum:
    name: https://download.foldingathome.org/releases/public/release/fahclient/centos-6.7-64bit/v7.5/fahclient-7.5.1-1.x86_64.rpm
    state: present
    validate_certs: true
  # Only install when the package is not already present.
  when: ansible_facts.packages['fahclient'] is not defined
  register: fah_install

# The client seems to overwrite the config when it starts the 1st time (?)
# So in order to make the config change persist we need to stop it first.
- name: Stop FAHClient so we can configure it
  sysvinit:
    name: FAHClient
    state: stopped
  when: fah_install is changed

# A handler doesn't work well here because we don't want to reload
# at the end, after it was already stopped and started.
- name: Configure FAHClient
  template:
    src: config.xml.j2
    dest: /etc/fahclient/config.xml
    owner: fahclient
    group: root
    # Quoted so the YAML parser cannot reinterpret the octal literal.
    # NOTE(review): the rendered file contains the admin password and is
    # world-readable with this mode - confirm that is intended.
    mode: "0644"
  register: fah_config

# FIXME: use of 'pattern' is a workaround in these sysvinit tasks
# Remove after https://github.com/ansible/ansible/pull/68472
#
# Runs in check mode so nothing is started here; the registered result is
# "changed" only when the service would have had to be started, i.e. when
# it is not currently running.
- name: Check if FAHClient is running
  sysvinit:
    name: FAHClient
    state: started
    pattern: /usr/bin/FAHClient
  register: fah_running_check
  check_mode: true
  changed_when: false

# If it was already running, we need this to apply config change
- name: Reload FAHClient if configuration changed
  sysvinit:
    name: FAHClient
    state: reloaded
    pattern: /usr/bin/FAHClient
  when:
    - fah_config is changed
    - fah_running_check is not changed

- name: Ensure FAHClient is enabled and running
  sysvinit:
    name: FAHClient
    state: started
    enabled: true
    pattern: /usr/bin/FAHClient
---
# Entry point for the client role: gather package facts, set the timezone,
# then pull in the storage, update and config task files.

- name: Gather package facts
  package_facts:
  # Facts are needed by later conditionals, so gather them in check mode too.
  check_mode: false
  # tags:
  #   - repos
  #   - update

- name: set timezone
  timezone:
    name: "{{ timezone }}"

- import_tasks: storage.yml
  tags:
    - storage

# Updates are opt-in through the apply_updates variable.
- import_tasks: update.yml
  tags:
    - update
  when: apply_updates | bool

- import_tasks: config.yml
  tags:
    - config
---
# When booting from a snapshot, the attached vdb scratch volume
# would need additional post-boot configuration.
# Don't think we need the space for now, so the default mount is removed.
- name: Remove default mount
  mount:
    state: absent
    src: /dev/vdb
    path: /mnt

# Disabled: format the scratch volume and mount it for FAHClient.
# - name: Make scratch filesystem
#   filesystem:
#     dev: "/dev/vdb"
#     fstype: "ext4"
#     resizefs: "yes"

# - name: Mount scratch filesystem and add to /etc/fstab
#   mount:
#     src: "/dev/vdb"
#     path: "/var/lib/fahclient"
#     state: "mounted"
#     fstype: "ext4"
#     passno: "2"
---
# Apply all pending yum updates and reboot when an installed kernel package
# has a newer release than the kernel that is currently running.

- name: Apply yum updates
  yum:
    name: '*'
    state: latest
  register: yum_update

# The package list is stale once yum changed something, so refresh it.
- name: Re-gather package facts after update
  package_facts:
  check_mode: false
  when: yum_update is changed

# Loops over every installed kernel package and sets kernel_reboot as soon
# as one of them has a release newer than the running one. Both values are
# stripped of their ".el7" / ".el7.x86_64" suffix before being compared.
- name: Check if we need to reboot into new kernel
  set_fact:
    kernel_reboot: true
  vars:
    installed_release: '{{ item.release | regex_replace("\.el7") }}'
    running_release: '{{ ansible_facts["kernel"].split("-")[1] | regex_replace("\.el7\.x86_64") }}'
  loop: "{{ ansible_facts.packages['kernel'] }}"
  loop_control:
    label: "{{ installed_release }}"
  when: installed_release is version(running_release, '>')

# can add extra conditions here
- name: Reboot for kernel update
  reboot:
  when: kernel_reboot | default(false)
  register: node_reboot

# - name: Re-gather facts after reboot
#   setup:
#   when: node_reboot.changed
{#
  Note: FAHClient parses and re-writes the config file,
  so we try to match what it will write to maintain idempotence.
  Also see: /usr/share/doc/fahclient/sample-config.xml

  Free-text values (user, passkey, password) are XML-escaped so that a
  quote, '&' or '<' in a variable cannot produce a malformed file.
  The allow list is trimmed so that an empty fah_manager does not leave a
  trailing space after 127.0.0.1.
  GPU-specific elements are only emitted for hosts in the gpu_nodes group.
#}
<config>
  <!-- Client Control -->
  <idle-seconds v='0'/>

  <!-- Folding Slot Configuration -->
  <client-type v='advanced'/>
  <max-packet-size v='big'/>

  <!-- HTTP Server -->
  <allow v='{{ ("127.0.0.1 " ~ fah_manager) | trim }}'/>

  <!-- Remote Command Server -->
  <password v='{{ fah_adminpass | e }}'/>

  <!-- Slot Control -->
  <power v='full'/>
{% if 'gpu_nodes' in group_names %}
  <gpu v='true'/>
{% endif %}

  <!-- User Information -->
{% if fah_passkey is defined %}
  <passkey v='{{ fah_passkey | e }}'/>
{% endif %}
  <team v='{{ fah_team }}'/>
  <user v='{{ fah_user | e }}'/>

  <!-- Folding Slots -->
  <slot id='0' type='CPU'/>
{% if 'gpu_nodes' in group_names %}
  <slot id='1' type='GPU'/>
{% endif %}
</config>
\ No newline at end of file
folding-controller
==================
Configure an admin node for Folding@home that runs FAHControl and FAHViewer.
This node needs to reach port 36330 on the FAHClient nodes.
---
# Role defaults for the FAHControl admin node.

# Directory where the inventory dump and the injection script are written.
fahcontrol_directory: '/home/centos'

# User account which will run FAHControl; it owns the files written above.
fahcontrol_user: 'centos'
---
# Install FAHControl/FAHViewer and load the client node list into the
# FAHControl database.

- name: Install packages
  yum:
    name:
      - https://download.foldingathome.org/releases/public/release/fahcontrol/centos-6.7-64bit/v7.5/fahcontrol-7.5.1-1.noarch.rpm
      - https://download.foldingathome.org/releases/public/release/fahviewer/centos-6.7-64bit/v7.5/fahviewer-7.5.1-1.x86_64.rpm
      - xorg-x11-xauth
      - mesa-dri-drivers
    state: present

# Writes the list of client nodes that the injection script reads.
- name: Dump inventory
  template:
    src: inventory-dump.j2
    dest: "{{ fahcontrol_directory }}/inventory-dump"
    owner: "{{ fahcontrol_user }}"
    group: "{{ fahcontrol_user }}"
  register: dump

- name: Write injection script
  template:
    src: inject-nodes.py.j2
    dest: "{{ fahcontrol_directory }}/inject-nodes.py"
    owner: "{{ fahcontrol_user }}"
    group: "{{ fahcontrol_user }}"
    # Quoted so the YAML parser cannot reinterpret the octal literal.
    mode: "0750"
  register: inject

# Should make sure FAHControl is not running now
#
# Only re-run the injection when the node list or the script itself changed.
# NOTE(review): with become disabled the script runs as the connecting user
# and resolves "~" for that user - confirm this matches fahcontrol_user.
- name: Inject hosts into FAHControl DB
  command: "{{ fahcontrol_directory }}/inject-nodes.py"
  become: false
  when: dump is changed or inject is changed
#!/usr/bin/python
# Read node names and IP addresses from a CSV file,
# inject into sqlite3 DB for FAHControl. Schema is:
# CREATE TABLE "clients" ("name" text NOT NULL,"address" text NOT NULL,"port" integer NOT NULL,"password" text NOT NULL,PRIMARY KEY (name));
#
# This file is a Jinja2 template: the double-brace expressions inside the
# string literals below are substituted before the script is run.
import csv
import sqlite3
from contextlib import closing
from os.path import expanduser

port = 36330
password = "{{ fah_adminpass }}"
inputfile = "{{ fahcontrol_directory }}" + "/inventory-dump"
sqlitefile = expanduser("~") + "/.FAHClient/FAHControl.db"


def _read_clients(csv_path, client_port, client_password):
    """Return one (name, address, port, password) tuple per CSV row.

    Blank lines are skipped: csv.reader yields an empty list for them, which
    would otherwise become a 2-tuple and make the 4-column INSERT fail.
    """
    clients = []
    with open(csv_path) as handle:
        for row in csv.reader(handle):
            if not row:
                continue
            clients.append(tuple(row) + (client_port, client_password))
    return clients


def inject_nodes(csv_path, db_path, client_port, client_password):
    """Replace the contents of the clients table with the rows from csv_path.

    The DELETE and the INSERTs run in a single transaction, so a failure
    leaves the previous table contents in place. The connection is always
    closed, also on error.

    Returns the number of client rows read from the CSV file.
    """
    clients = _read_clients(csv_path, client_port, client_password)
    with closing(sqlite3.connect(db_path)) as conn:
        # "with conn" commits on success and rolls back on an exception.
        with conn:
            cursor = conn.cursor()
            cursor.execute('DELETE FROM clients')
            print("Deleted " + str(cursor.rowcount) + " rows from clients table.")
            cursor.executemany('INSERT INTO clients VALUES (?,?,?,?)', clients)
            print("Inserted " + str(cursor.rowcount) + " rows into clients table.")
    return len(clients)


if __name__ == "__main__":
    inject_nodes(inputfile, sqlitefile, port, password)
{#
  One CSV line per client node: inventory hostname, then connection address.
  A missing cpu_nodes or gpu_nodes group is treated as empty instead of
  aborting the render. A host without ansible_host falls back to its
  inventory hostname - NOTE(review): confirm FAHControl accepts a hostname
  in the address column.
-#}
{% for node in groups['cpu_nodes'] | default([]) | union(groups['gpu_nodes'] | default([])) %}
{{ node }},{{ hostvars[node].ansible_host | default(node) }}
{% endfor %}
---
# Play for the compute nodes: applies the folding-client role to every host
# in the cpu_nodes and gpu_nodes groups so they run FAHClient.
- hosts: cpu_nodes:gpu_nodes
  become: true
  become_user: root
  roles:
    - folding-client
# rename this file to main.tf
#
# Example root configuration: calls the ./openstack module with the node
# counts, images and flavors for the CPU and GPU pools.

terraform {
  required_version = ">= 0.12"
}

module "openstack" {
  source = "./openstack"

  # CPU pool
  cpu_node_count  = 260
  cpu_node_image  = "CentOS-7-x86_64"
  cpu_node_flavor = "c32-120gb-425"

  # GPU pool
  gpu_node_count  = 100
  gpu_node_image  = "CentOS-7-x86_64-vGPU"
  gpu_node_flavor = "g1-18gb-c4-22gb"

  # Shared settings
  network   = "tenant-network"
  publickey = "ssh-rsa AAAA...."
}
# The module is written for Terraform 0.12 or newer.
terraform {
  required_version = ">= 0.12"
}
# Key pair attached to every instance created by this module.
# It is named after the cluster and holds the caller-supplied public key.
resource "openstack_compute_keypair_v2" "terrafold_key" {
  public_key = var.publickey
  name       = var.cluster_name
}
# CPU instances, named "<cpu_node_name>-terrafolder-<n>" with n starting at 1.
resource "openstack_compute_instance_v2" "terrafolder" {
  count = var.cpu_node_count

  name            = format("%s-terrafolder-%d", var.cpu_node_name, count.index + 1)
  flavor_name     = var.cpu_node_flavor
  image_name      = var.cpu_node_image
  key_pair        = openstack_compute_keypair_v2.terrafold_key.name
  security_groups = ["default"]

  network {
    name = var.network
  }

  # Marks the instance as a non-GPU node.
  # NOTE(review): presumably consumed when building the inventory groups - confirm.
  metadata = {
    terrafold_gpu = "false"
  }

  # A changed image name is ignored for instances that already exist.
  lifecycle {
    ignore_changes = [image_name]
  }
}
# GPU instances, named "<gpu_node_name>-terrafolder-<n>" with n starting at 1.
resource "openstack_compute_instance_v2" "gpu_terrafolder" {
  count = var.gpu_node_count

  name            = format("%s-terrafolder-%d", var.gpu_node_name, count.index + 1)
  flavor_name     = var.gpu_node_flavor
  image_name      = var.gpu_node_image
  key_pair        = openstack_compute_keypair_v2.terrafold_key.name
  security_groups = ["default"]

  network {
    name = var.network
  }

  # Marks the instance as a GPU node.
  # NOTE(review): presumably consumed when building the inventory groups - confirm.
  metadata = {
    terrafold_gpu = "true"
  }

  # A changed image name is ignored for instances that already exist.
  lifecycle {
    ignore_changes = [image_name]
  }
}
# cluster variables
variable "cluster_name" {
  description = "Name given to the OpenStack key pair created for the cluster."
  type        = string
  default     = "terrafold"
}

variable "network" {
  description = "Name of the network that every instance is attached to."
  type        = string
}

variable "publickey" {
  description = "Public key registered in the key pair and assigned to every instance."
  type        = string
}
# CPU node variables
variable "cpu_node_image" {
  description = "Image name used for the CPU instances."
  type        = string
}

variable "cpu_node_count" {
  description = "Number of CPU instances to create."
  type        = number
  default     = 1
}

variable "cpu_node_flavor" {
  description = "Flavor name used for the CPU instances."
  type        = string
}

variable "cpu_node_name" {
  description = "Prefix of the CPU instance names (<prefix>-terrafolder-<n>)."
  type        = string
  default     = "cpu"
}
# GPU node variables
variable "gpu_node_image" {
  description = "Image name used for the GPU instances."
  type        = string
}

variable "gpu_node_count" {
  description = "Number of GPU instances to create."
  type        = number
  default     = 1
}

variable "gpu_node_flavor" {
  description = "Flavor name used for the GPU instances."
  type        = string
}

variable "gpu_node_name" {
  description = "Prefix of the GPU instance names (<prefix>-terrafolder-<n>)."
  type        = string
  default     = "gpu"
}
# No provider settings are given inline.
# NOTE(review): credentials and endpoint are presumably supplied from outside
# this file (for example through the environment) - confirm.
provider "openstack" {}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment