carriel-sur.ace.me

Carriel-Sur Weather Station

Height: 8.0 m
Wind database from the Carriel-Sur station (January 1966 to the present).
Data provided by the Dirección Meteorológica de Chile (meteochile.gob.cl).
By Dr(c). Claudio Iturra

171

Data-processing routines for the Carriel-Sur data

from pathlib import Path import re import pandas as pd # -------- settings -------- HTML_DIR = Path("/home/cl/filen/database/doctoral-thesis/weather-stations/cs/") # folder with your *.html files GLOB = "20*.html" # e.g., 200611.html, 200612.html, etc. OUT_CSV = HTML_DIR / (): re.sub(, , (s).strip().lower()) (): s = (s).strip() (re.(, s)) (): s = (s).strip() (re.(, s)) (): i, c (cols): nc = _norm(c) (re.search(p, nc) p patterns): i (): cols = (df.columns) ni = {i: _norm(c) i,c (cols)} date_i = _col_idx_by_name(cols, []) time_i = _col_idx_by_name(cols, []) date_i time_i : date_i, time_i sample = df.head().astype() i ((cols)): sample.iloc[:, i].(_looks_like_date).mean() > : date_i = i j ((cols)): sample.iloc[:, j].(_looks_like_time).mean() > : time_i = j date_i : date_i = time_i : time_i = date_i, time_i (): s = _norm(s) s {,,,,,,,} (): cols = (df.columns) ncols = [_norm(c) c cols] dir_i = _col_idx_by_name(cols, [, , ]) spd_i = _col_idx_by_name(cols, [, , ]) vrb_i = _col_idx_by_name(cols, []) dir_i spd_i : date_i, time_i = _find_datetime_cols(df) dt = pd.to_datetime( df.iloc[:, date_i].astype()..strip() + + df.iloc[:, time_i].astype()..strip(), errors=, dayfirst= ) out = pd.DataFrame({ : dt, : pd.to_numeric(df.iloc[:, dir_i], errors=), : pd.to_numeric(df.iloc[:, spd_i], errors=), }) vrb_i : out[] = df.iloc[:, vrb_i].(_to_bool_vrb) : out[] = pd.NA out[] = src_name out.dropna(subset=[]).sort_values() (): frames = [] : tbl pd.read_html(path, flavor=, header=): res = parse_one_table(tbl, path.name) res res.empty: frames.append(res) Exception: frames: pd.DataFrame(columns=[,,,,]) df = pd.concat(frames, ignore_index=) df = df.drop_duplicates(subset=[], keep=) df (): all_frames = [] p (html_dir.glob(pattern)): df = parse_file(p) df.empty: () : all_frames.append(df) all_frames: pd.DataFrame(columns=[,,,,]) merged = pd.concat(all_frames, ignore_index=) merged = (merged.sort_values([,]) .drop_duplicates(subset=[], keep=)) merged __name__ == : ts = parse_all() 
ts.to_csv(OUT_CSV, index=) ()

                      time  wind_dir_deg  wind_speed_kt    vrb  source_file
0      2004-01-01 00:00:00          20.0            9.0  False  200401.html
1      2004-01-01 01:00:00          20.0            4.0  False  200401.html
2      2004-01-01 02:00:00          20.0            4.0  False  200401.html
3      2004-01-01 03:00:00          20.0            3.0  False  200401.html
4      2004-01-01 04:00:00          90.0            3.0  False  200401.html
...                    ...           ...            ...    ...          ...
26278  2006-12-31 19:00:00         250.0           14.0  False  200612.html
26279  2006-12-31 20:00:00         230.0           13.0  False  200612.html
26280  2006-12-31 21:00:00         230.0           13.0  False  200612.html
26281  2006-12-31 22:00:00         240.0           19.0  False  200612.html
26282  2006-12-31 23:00:00         230.0           15.0  False  200612.html