summaryrefslogtreecommitdiff
path: root/parser/daochai.fnl
diff options
context:
space:
mode:
author unwox <me@unwox.com> 2025-02-17 19:57:58 +0600
committer unwox <me@unwox.com> 2025-02-17 19:57:58 +0600
commit fd807bf1952073aff866bd0961ad6929e07da80d (patch)
tree 43c653276b18040da2faee088e7734a345f39df6 /parser/daochai.fnl
parent 35541ad02fddd7cb1d8840f47b2a203e125b0acf (diff)
add daochai parser
Diffstat (limited to 'parser/daochai.fnl')
-rw-r--r-- parser/daochai.fnl 187
1 files changed, 187 insertions, 0 deletions
diff --git a/parser/daochai.fnl b/parser/daochai.fnl
new file mode 100644
index 0000000..50ec508
--- /dev/null
+++ b/parser/daochai.fnl
@@ -0,0 +1,187 @@
+(import-macros {: reduce} :lib.macro)
+
+;; Pick the PEG library: use `lpeg` when it can be required, otherwise fall
+;; back to `lpeglj`.
+(local peg
+ ;; pick-values 1 keeps only pcall's success flag and drops the module (or
+ ;; error message) it also returns.
+ (if (pick-values 1 (pcall require :lpeg))
+ (require :lpeg)
+ (require :lpeglj)))
+(local parser (require :parser.parser))
+(local number (require :lib.number))
+(local fetcher (require :fetcher))
+
+;; Decimal HTML character references for the Russian alphabet, keyed by the
+;; complete entity text so the table can be handed to gsub as its replacement
+;; argument. Built once at load time rather than on every call.
+(local cyrillic-entities
+  {;; Ё/ё live outside the contiguous А..я block (U+0410..U+044F)
+   "&#1025;" "Ё" "&#1105;" "ё"
+   "&#1040;" "А" "&#1072;" "а"
+   "&#1041;" "Б" "&#1073;" "б"
+   "&#1042;" "В" "&#1074;" "в"
+   "&#1043;" "Г" "&#1075;" "г"
+   "&#1044;" "Д" "&#1076;" "д"
+   "&#1045;" "Е" "&#1077;" "е"
+   "&#1046;" "Ж" "&#1078;" "ж"
+   "&#1047;" "З" "&#1079;" "з"
+   "&#1048;" "И" "&#1080;" "и"
+   "&#1049;" "Й" "&#1081;" "й"
+   "&#1050;" "К" "&#1082;" "к"
+   "&#1051;" "Л" "&#1083;" "л"
+   "&#1052;" "М" "&#1084;" "м"
+   "&#1053;" "Н" "&#1085;" "н"
+   "&#1054;" "О" "&#1086;" "о"
+   "&#1055;" "П" "&#1087;" "п"
+   "&#1056;" "Р" "&#1088;" "р"
+   "&#1057;" "С" "&#1089;" "с"
+   "&#1058;" "Т" "&#1090;" "т"
+   "&#1059;" "У" "&#1091;" "у"
+   "&#1060;" "Ф" "&#1092;" "ф"
+   "&#1061;" "Х" "&#1093;" "х"
+   "&#1062;" "Ц" "&#1094;" "ц"
+   "&#1063;" "Ч" "&#1095;" "ч"
+   "&#1064;" "Ш" "&#1096;" "ш"
+   "&#1065;" "Щ" "&#1097;" "щ"
+   "&#1066;" "Ъ" "&#1098;" "ъ"
+   "&#1067;" "Ы" "&#1099;" "ы"
+   "&#1068;" "Ь" "&#1100;" "ь"
+   "&#1069;" "Э" "&#1101;" "э"
+   "&#1070;" "Ю" "&#1102;" "ю"
+   "&#1071;" "Я" "&#1103;" "я"})
+
+;; Replace decimal HTML entities for Russian letters (А-я plus Ё/ё) in `str`
+;; with the UTF-8 letters themselves.
+;;
+;; str - string to decode; must not be nil.
+;; Returns the decoded string (exactly one value). Entities that are not in
+;; the table (e.g. "&amp;" or "&#8212;") are left untouched.
+(fn html-cyrillic->utf [str]
+  ;; With a table as replacement gsub looks every match up by its full text
+  ;; and keeps the original text when the lookup yields nil, so a single pass
+  ;; over the string handles all letters. pick-values drops gsub's second
+  ;; result (the substitution count).
+  (pick-values 1 (str:gsub "&#%d+;" cyrillic-entities)))
+
+;; Build the listing URL for category `path`. Pages after the first get a
+;; trailing "page-N/" segment; page 1 is the bare category URL.
+(fn format-url [path page]
+  (let [page-segment (if (> page 1)
+                         (.. "page-" page "/")
+                         "")]
+    (.. "https://daochai.ru/" path "/" page-segment)))
+
+;; Pattern for one product entry of a listing page. It is a sequence (`*`) of
+;; four parts in this order: product image, title link, price (optionally
+;; followed by a weight), and a span containing the word "Купить" ("Buy").
+;; Named group captures: image, url, title, price and, when present, weight.
+;; NOTE(review): presumably parser.anywhere skips ahead until its argument
+;; matches, parser.tag matches a tag with the listed attributes and "*" stands
+;; for "any value" — confirm against parser.parser. Whether the order of the
+;; attribute keys matters cannot be told from this file.
+(local product-peg
+ (*
+ (parser.anywhere
+ (+
+ ;; eager and lazy loaded versions of img: the first alternative captures
+ ;; the image URL from src, the second from data-src
+ (parser.tag :img {:class "ty-pict cm-image" ;; FRAGILE
+ :src (peg.Cg (parser.till "\"") :image)
+ :id "*"
+ :title "*"
+ :alt "*"
+ :srcset "*"
+ :width "*"
+ :height "*"})
+ (parser.tag :img {:class "ty-pict cm-image" ;; FRAGILE
+ :src "*"
+ :data-src (peg.Cg (parser.till "\"") :image)
+ :id "*"
+ :title "*"
+ :alt "*"
+ :data-srcset "*"
+ :width "*"
+ :height "*"})))
+ ;; title link: href is captured as url, the link text as title
+ (parser.anywhere
+ (parser.tag :a {:class "product-title"
+ :href (peg.Cg (parser.till "\"") :url)
+ :title "*"}
+ (peg.Cg (parser.till "</a>") :title)))
+ ;; ordered choice: try price followed by weight first, and fall back to the
+ ;; price alone when no weight text follows
+ (+
+ (*
+ (parser.anywhere
+ (parser.tag :span {:class "ty-price-num" :id "*"}
+ (peg.Cg (parser.till "</span>") :price)))
+ (parser.anywhere
+ ;; "за" and "гр" words are html-encoded for some reason; the literal
+ ;; matched here is "за <number> гр." with <number> captured as weight
+ (* "&#1079;&#1072; " (peg.Cg parser.pegs.number :weight) " &#1075;&#1088;.")))
+ (parser.anywhere
+ (parser.tag :span {:class "ty-price-num" :id "*"}
+ (peg.Cg (parser.till "</span>") :price))))
+ (parser.anywhere
+ ;; "Купить", html-encoded like the words above
+ (parser.tag :span {} "&#1050;&#1091;&#1087;&#1080;&#1090;&#1100;"))))
+
+;; Convert one raw `product` table into the record this module reports.
+;; Reads product.title, product.url, product.image, product.price and
+;; product.weight (presumably the named captures of product-peg — confirm
+;; against fetcher.from-html).
+;;
+;; The title has its HTML-encoded Cyrillic decoded, price and weight go through
+;; number.string->number, and year/volume are guessed from the decoded title.
+;; :price-per is price divided by weight, rounded up to one decimal place; it
+;; is nil unless both numbers are present and the weight is positive.
+(fn normalize [product]
+  (let [name (html-cyrillic->utf product.title)
+        vintage (parser.guess-year name)
+        grams (number.string->number product.weight)
+        cost (number.string->number product.price)]
+    {:site "daochai"
+     :title name
+     :url product.url
+     :description nil
+     :image product.image
+     :year vintage
+     :price cost
+     :weight grams
+     :volume (parser.guess-volume name)
+     :price-per (when (and cost grams (> grams 0))
+                  ;; scale by 10, ceil, scale back: rounds up to 0.1
+                  (/ (math.ceil (* (/ cost grams) 10)) 10))}))
+
+;; Fetch the products of every listed daochai.ru category.
+;; Hands fetcher.from-html the category list (each entry a table with :path
+;; and :tags) together with format-url, product-peg and normalize, and returns
+;; whatever fetcher.from-html returns.
+(fn products []
+  ;; Small constructor so each category reads as "path, tag, tag...".
+  (fn category [path ...]
+    {: path :tags [...]})
+
+  ;; NOTE(review): "posuda/chahai" and "posuda/chahaj" differ by one letter —
+  ;; confirm that both category slugs exist on the site.
+  (fetcher.from-html
+    [(category "vid-chaya/pu-erh/shu-puer" "Шу пуэр")
+     (category "vid-chaya/pu-erh/shen" "Шен пуэр")
+     (category "vid-chaya/ulun" "Улун")
+     (category "vid-chaya/ulun/fudzjanskie-uluny" "Улун" "Фудзянь")
+     (category "vid-chaya/ulun/guandunskie-uluny" "Улун" "Гуандун")
+     (category "vid-chaya/ulun/uishanskie-uluny" "Улун" "Уишань")
+     (category "vid-chaya/ulun/taiwan-ulun" "Улун" "Тайвань")
+     (category "vid-chaya/ulun/yunnanskiy-uluny" "Улун" "Юннань")
+     (category "vid-chaya/krasnyj-chaj" "Красный чай")
+     (category "vid-chaya/zeljonyj-chaj" "Зеленый чай")
+     (category "vid-chaya/white" "Белый чай")
+     (category "vid-chaya/zheltyy-chay" "Желтый чай")
+     (category "vid-chaya/heicha" "Хэй ча")
+     (category "posuda/jianshuizitao" "Посуда" "Чайник")
+     (category "posuda/nisintao" "Посуда" "Чайник")
+     (category "posuda/chahu-taozi" "Посуда" "Чайник")
+     (category "posuda/chayniki-iz-chaochzhou" "Посуда" "Чайник")
+     (category "posuda/jingdezhen" "Посуда")
+     (category "posuda/chahai" "Посуда")
+     (category "posuda/gajvan" "Посуда")
+     (category "posuda/chahaj" "Посуда")
+     (category "posuda/chaban" "Посуда")
+     (category "posuda/chajnye-prudy" "Посуда")
+     (category "posuda/sito" "Посуда")
+     (category "posuda/posuda-chajnoj-ceremonii" "Посуда")
+     (category "posuda/termosy" "Посуда")
+     (category "posuda/alternativa" "Посуда")
+     (category "tea-accessorize" "Посуда")
+     (category "chay-i-chan/aroma" "Благовония")
+     (category "chay-i-chan/kurilnicy-i-podstavki-pod-blagovoniya" "Благовония")
+     (category "chay-i-chan/chetki" "Четки")
+     (category "chay-i-chan/dekorirovanie-prostranstva" "Декор")
+     (category "chay-i-chan/figurki-iz-dereva" "Фигурки")]
+    format-url
+    product-peg
+    normalize))
+
+;; Module value: the product fetcher plus the shop's display title and URL.
+{: products
+ :title "DaoChai"
+ :url "https://daochai.ru"}