Init data science project structure

cdf8273c · Thomas Robert · cdf8273c · cdf8273c · cdf8273c · cdf8273c
Commit cdf8273c authored Jan 10, 2017 by Thomas Robert
18 changed files
--- a/.gitignore
+++ b/.gitignore
+/.env
+/config-private.yml
+.gitkeep
+
+/data
+/results
--- a/Makefile
+++ b/Makefile
+
+
+.env: config.yml config-private.yml
+	python src/misc/yaml-to-env.py
--- a/README.md
+++ b/README.md
+# Data science project structure
+
+Based on https://drivendata.github.io/cookiecutter-data-science/
+
+```
+.
+├── Makefile                <- tasks
+├── config.yml              <- config file in YAML, can be exported as env vars if needed
+├── config-private.yml      <- config file with private config (password, api keys, etc.)
+├── data
+│   ├── raw
+│   ├── intermediate
+│   ├── processed
+│   └── temp
+├── results
+│   ├── outputs
+│   └── models
+├── documents
+│   ├── docs
+│   ├── images
+│   └── references
+├── notebooks               <- notebooks for explorations / prototyping
+└── src                     <- all source code, internal org as needed
+```
--- a/config-private.yml
+++ b/config-private.yml
+
--- a/config.yml
+++ b/config.yml
--- a/data/intermediate/.gitkeep
+++ b/data/intermediate/.gitkeep
--- a/data/processed/.gitkeep
+++ b/data/processed/.gitkeep
--- a/data/raw/.gitkeep
+++ b/data/raw/.gitkeep
--- a/data/temp/.gitkeep
+++ b/data/temp/.gitkeep
--- a/documents/docs/.gitkeep
+++ b/documents/docs/.gitkeep
--- a/documents/images/.gitkeep
+++ b/documents/images/.gitkeep
--- a/documents/references/.gitkeep
+++ b/documents/references/.gitkeep
--- a/notebooks/.gitkeep
+++ b/notebooks/.gitkeep
--- a/results/.gitkeep
+++ b/results/.gitkeep
--- a/results/models/.gitkeep
+++ b/results/models/.gitkeep
--- a/results/outputs/.gitkeep
+++ b/results/outputs/.gitkeep
--- a/src/.gitkeep
+++ b/src/.gitkeep
--- a/src/misc/yaml-to-env.py
+++ b/src/misc/yaml-to-env.py
+#!/usr/bin/env python
+
+import yaml
+import pipes
+
+# Merge data structures
+def merge(a, b):
+    """Recursively merge ``b`` into ``a`` and return the merged value.
+
+    * Two dicts are merged key by key: keys only present in ``a`` are kept,
+      and every key of ``b`` is merged with the matching value of ``a``
+      (``None`` when ``a`` has no such key).
+    * Two lists are merged element by element; the shorter list is padded
+      with ``None`` so the extra elements of the longer one are kept.
+    * In every other case ``b`` wins, unless it is ``None``, in which case
+      ``a`` is returned.
+
+    The inputs themselves are not modified.
+    """
+    if isinstance(a, dict) and isinstance(b, dict):
+        d = dict(a)
+        d.update({k: merge(a.get(k), b[k]) for k in b})
+        return d
+
+    if isinstance(a, list) and isinstance(b, list):
+        # Local import: itertools is not imported at module level.
+        import itertools
+
+        # The helper is named zip_longest on Python 3 and izip_longest on
+        # Python 2; pick whichever this interpreter provides.
+        zip_longest = getattr(itertools, "zip_longest", None)
+        if zip_longest is None:
+            zip_longest = itertools.izip_longest
+        return [merge(x, y) for x, y in zip_longest(a, b)]
+
+    return a if b is None else b
+
+# Read config file, keep env
+def readFileKeepEnv(filename):
+    """Return the lines of ``filename`` that contain the ``#env`` marker.
+
+    Only lines containing the literal text ``#env`` are kept; all other
+    lines are dropped. Each kept line is followed by an extra newline, so
+    the result always ends with a line terminator even when the last line
+    of the file has none.
+
+    :param filename: path of the file to read.
+    :return: the kept lines concatenated into a single string (empty string
+        when no line carries the marker).
+    """
+    # Open the file that was actually requested and make sure the handle is
+    # closed once the lines have been read.
+    with open(filename, "r") as f:
+        return "".join(line + "\n" for line in f if "#env" in line)
+
+# Load config files
+# NOTE: safe_load is used instead of yaml.load: plain config files need no
+# arbitrary object construction, and recent PyYAML releases refuse
+# yaml.load() without an explicit Loader.
+config = yaml.safe_load(readFileKeepEnv("config.yml"))
+config_priv = yaml.safe_load(readFileKeepEnv("config-private.yml"))
+# When neither file has any "#env" line both loads return None and so does
+# merge(); fall back to an empty mapping so the export below still works.
+config = merge(config, config_priv) or {}
+
+print(config)
+
+# Export as env vars
+# TODO generalise to nested dict
+text_types = (type(""), type(u""))
+env_lines = []
+for k, v in config.items():
+    # pipes.quote only accepts text; YAML scalars may be int, float, bool...
+    if not isinstance(k, text_types):
+        k = str(k)
+    if v is None:
+        # A key without a value is exported as an empty variable.
+        v = ""
+    elif not isinstance(v, text_types):
+        v = str(v)
+    env_lines.append("%s=%s\n" % (pipes.quote(k), pipes.quote(v)))
+envFile = "".join(env_lines)
+# Context manager so the file is flushed and closed deterministically.
+with open(".env", "w") as env_out:
+    env_out.write(envFile)