Carriel Sur weather station
Height: 8.0 m
Database of wind observations from the Carriel Sur station (January 1966 to the present).
Data provided by the Dirección Meteorológica de Chile (meteochile.gob.cl).
By Dr.(c) Claudio Iturra
Location: -36.78055°, -73.06639° (Concepción, Chile)

Data-processing routines for Carriel Sur data.
from pathlib import Path
import re
import pandas as pd
# Module-level configuration.
# HTML_DIR is a hard-coded absolute path on the author's machine; presumably it is
# the directory that holds the station's HTML files -- confirm before running elsewhere.
HTML_DIR = Path("/home/cl/filen/database/doctoral-thesis/weather-stations/cs/" )
# Glob pattern: file names that start with "20" and end with ".html".
GLOB = "20*.html"
# NOTE(review): the right-hand operand of `/` (the output file name) is missing
# from this line, so the statement is incomplete as it stands.
OUT_CSV = HTML_DIR /
# NOTE(review): this text has lost its keywords and literals. The `def` header,
# the `return`, the callable wrapped around `s`, and the first two re.sub
# arguments (pattern and replacement) are all missing.
# Presumably this is `_norm(s)`, the helper called elsewhere in the file as
# `_norm(c)` / `_norm(s)` -- confirm against the original file.
# What is visible: the value is stripped of surrounding whitespace, lower-cased,
# and then passed through a regex substitution.
( ):
re.sub( , , (s).strip().lower())
# NOTE(review): header, keywords, the regex function name and the pattern are
# missing. Presumably this is `_looks_like_date(s)`, one of the two
# `_looks_like_*` predicates referenced elsewhere in the file; it is
# indistinguishable from the block that follows, so the date/time assignment
# is a guess based on order -- confirm.
# What is visible: `s` is stripped of surrounding whitespace and an `re`
# function is applied to it with a (missing) pattern.
( ):
s = (s).strip()
(re. ( , s))
# NOTE(review): header, keywords, the regex function name and the pattern are
# missing. Presumably this is `_looks_like_time(s)`, the second of the two
# `_looks_like_*` predicates referenced elsewhere in the file -- confirm.
# What is visible: `s` is stripped of surrounding whitespace and an `re`
# function is applied to it with a (missing) pattern.
( ):
s = (s).strip()
(re. ( , s))
# NOTE(review): header and keywords are missing. Presumably this is
# `_col_idx_by_name(cols, patterns)`, which is called elsewhere in the file
# with a column collection and a list of patterns -- confirm.
# What is visible: it walks the columns as (index, name) pairs, normalises each
# name with `_norm`, tests it against every pattern `p` in `patterns` via
# `re.search`, and yields the index `i` on a hit.
# The no-match result is not visible here (presumably None -- TODO confirm).
( ):
i, c (cols):
nc = _norm(c)
(re.search(p, nc) p patterns):
i
# NOTE(review): header, keywords and literals are missing. Presumably this is
# `_find_datetime_cols(df)`, which `parse_one_table` unpacks into
# `date_i, time_i` -- confirm.
# Purpose as far as visible: locate the positional indices of the date column
# and the time column of `df`.
( ):
cols = (df.columns)
# `ni` (index -> normalised name) is built but not used in the lines visible here.
ni = {i: _norm(c) i,c (cols)}
# Step 1: look the columns up by header name (the name patterns are missing).
date_i = _col_idx_by_name(cols, [ ])
time_i = _col_idx_by_name(cols, [ ])
# Presumably an early return when both lookups succeeded -- the condition is not visible.
date_i time_i :
date_i, time_i
# Step 2: content-based detection on the first rows of the table
# (the row count and the cast target are missing).
sample = df.head( ).astype( )
i ( (cols)):
# Column `i` becomes the date column when the mean of `_looks_like_date` over
# the sample exceeds a threshold (value missing).
sample.iloc[:, i]. (_looks_like_date).mean() > :
date_i = i
j ( (cols)):
# Same test for the time column, using `_looks_like_time`.
sample.iloc[:, j]. (_looks_like_time).mean() > :
time_i = j
# Step 3: presumably positional fallbacks when detection failed; both the
# conditions and the fallback values are missing -- TODO confirm.
date_i : date_i =
time_i : time_i =
date_i, time_i
# NOTE(review): header, keywords and literals are missing. Presumably this is
# `_to_bool_vrb(s)`, which `parse_one_table` applies to the column found via
# `vrb_i` -- confirm.
# What is visible: the value is normalised with `_norm` and compared against a
# set literal with eight members (all missing), presumably a membership test
# that yields a bool.
( ):
s = _norm(s)
s { , , , , , , , }
# NOTE(review): header, keywords and string literals are missing. This is
# presumably `parse_one_table(df, src_name)`: `parse_file` calls it as
# `parse_one_table(tbl, path.name)` and the body reads `df` and `src_name`.
# Purpose as far as visible: turn one table into a frame holding a parsed
# timestamp, a numeric direction, a numeric speed, a `vrb` column and the
# source name.
( ):
cols = (df.columns)
# `ncols` is built but not used in the lines visible here.
ncols = [_norm(c) c cols]
# Locate the direction, speed and `vrb` columns by header name (patterns missing).
dir_i = _col_idx_by_name(cols, [ , , ])
spd_i = _col_idx_by_name(cols, [ , , ])
vrb_i = _col_idx_by_name(cols, [ ])
# Presumably a guard for tables lacking a direction or speed column; the
# condition and the guarded statement are not visible -- TODO confirm.
dir_i spd_i :
date_i, time_i = _find_datetime_cols(df)
# The timestamp is parsed from the date text, a (missing) separator and the
# time text; the `errors=` and `dayfirst=` values are missing.
dt = pd.to_datetime(
df.iloc[:, date_i].astype( ). .strip() + +
df.iloc[:, time_i].astype( ). .strip(),
errors= ,
dayfirst=
)
# Three output columns (names missing): the timestamp, and direction and speed
# passed through pd.to_numeric.
out = pd.DataFrame({
: dt,
: pd.to_numeric(df.iloc[:, dir_i], errors= ),
: pd.to_numeric(df.iloc[:, spd_i], errors= ),
})
# The `vrb` column comes from `_to_bool_vrb` when the source column was found;
# otherwise it is filled with pd.NA.
vrb_i :
out[ ] = df.iloc[:, vrb_i]. (_to_bool_vrb)
:
out[ ] = pd.NA
# Record which file the rows came from.
out[ ] = src_name
# Rows with NA in the (missing) subset column(s) are dropped and the result is sorted.
out.dropna(subset=[ ]).sort_values( )
# NOTE(review): header, keywords and literals are missing. This is presumably
# `parse_file(path)`: `parse_all` calls it as `parse_file(p)` and the body
# reads `path` and `path.name`.
# Purpose as far as visible: read every table in one HTML file with
# pd.read_html, parse each with `parse_one_table`, and combine the non-empty
# results into one de-duplicated frame.
( ):
frames = []
# Presumably a `try:` around the HTML read.
:
# The `flavor=` and `header=` values are missing.
tbl pd.read_html(path, flavor= , header= ):
res = parse_one_table(tbl, path.name)
# Only results that pass this check on `res` / `res.empty` are kept; the exact
# condition is not visible.
res res.empty:
frames.append(res)
# NOTE(review): the body of this `Exception` handler is not visible, so it is
# unclear whether failures are logged, skipped silently, or re-raised.
Exception:
# With no usable table, an empty frame with five (missing) column names is produced.
frames:
pd.DataFrame(columns=[ , , , , ])
df = pd.concat(frames, ignore_index= )
# Duplicates on the (missing) subset column(s) are dropped; the `keep=` policy is missing.
df = df.drop_duplicates(subset=[ ], keep= )
df
# NOTE(review): header, keywords and literals are missing. This is presumably
# `parse_all(html_dir, pattern)`. The script entry point calls `parse_all()`
# with no arguments, so both parameters presumably have defaults (likely
# HTML_DIR and GLOB) -- TODO confirm.
# Purpose as far as visible: parse every file matching `pattern` under
# `html_dir` and merge the results into one sorted, de-duplicated frame.
( ):
all_frames = []
# The callable wrapped around the glob result is missing (possibly `sorted`).
p (html_dir.glob(pattern)):
df = parse_file(p)
# For an empty result something is called with a missing argument, presumably
# a printed message; otherwise the frame is collected.
df.empty:
( )
:
all_frames.append(df)
# With nothing collected, an empty frame with five (missing) column names is produced.
all_frames:
pd.DataFrame(columns=[ , , , , ])
merged = pd.concat(all_frames, ignore_index= )
# Sort on two (missing) keys, then drop duplicates on a (missing) subset.
# Which duplicate survives depends on this sort order and the missing `keep=` value.
merged = (merged.sort_values([ , ])
.drop_duplicates(subset=[ ], keep= ))
merged
# Script entry point (presumably `if __name__ == "__main__":`; the keyword and
# the string literal are missing). It parses everything with `parse_all` and
# writes the result to OUT_CSV; the `index=` value is missing.
__name__ == :
ts = parse_all()
ts.to_csv(OUT_CSV, index= )
# A call with a missing callee and argument, presumably a final status message.
( )