diff options
| author | unwox <me@unwox.com> | 2024-09-27 15:26:33 +0600 |
|---|---|---|
| committer | unwox <me@unwox.com> | 2024-09-27 15:44:16 +0600 |
| commit | dd449357f502dbe9ca4487d4b06a06ee4e597146 (patch) | |
| tree | 9847488a6cc2c1aaf1fc80578e1a7a5d4af99ff5 /parser | |
| parent | 9b82db238f9e2e02a76f95c793f8d6ef2387ecfd (diff) | |
new structure
Diffstat (limited to 'parser')
| -rw-r--r-- | parser/artoftea.fnl | 71 | ||||
| -rw-r--r-- | parser/ipuer.fnl | 70 | ||||
| -rw-r--r-- | parser/ozchai.fnl | 69 | ||||
| -rw-r--r-- | parser/parser.fnl | 143 |
4 files changed, 353 insertions, 0 deletions
;;; NOTE: the four sections below are four separate Fennel modules added by
;;; this commit. Each one is its own file and ends with its own export table.

;;; ==========================================================================
;;; parser/artoftea.fnl (new file, mode 100644)
;;; ==========================================================================

(import-macros {: map} :lib.macro)

;; Prefer the C implementation of LPeg; fall back to the LuaJIT port.
(local peg
  (if (pick-values 1 (pcall require :lpeg))
    (require :lpeg)
    (require :lpeglj)))
(local parser (require :parser.parser))
(local number (require :lib.number))
(local fetcher (require :fetcher))

(fn url-formatter [path page]
  "Build the catalogue URL for category `path` and page number `page`."
  (.. "https://artoftea.ru/" path "/?page=" page))

;; Matches one product card. Named group captures: url, image, title,
;; description, weight (optional), price, id.
(local product-peg
  (* ;; url and image
     (parser.anywhere
       (parser.tag :div {:class "front-image"}
         (parser.tag :a {:href (peg.Cg (parser.till "\"") :url)}
           (parser.tag :img {:src (peg.Cg (parser.till "\"") :image)
                             :title "*" :class "*" :alt "*"}))))
     ;; title
     (parser.anywhere
       (parser.tag :div {:class "name"}
         (parser.tag :a {:href "*"} (peg.Cg (parser.till "</a>") :title))))
     ;; description
     (parser.anywhere
       (parser.tag :p {:class "description"}
         (peg.Cg (parser.till "</p>") :description)))
     ;; weight followed by price, or price alone when the card has no
     ;; selected weight option
     (+
       (*
         (parser.anywhere
           (parser.tag :option {:value "*" :selected "selected"}
             (* (peg.Cg parser.pegs.number :weight) " гр" parser.pegs.spaces)))
         (parser.anywhere
           (parser.tag :p {:class "price"}
             (parser.tag :span {:id "*"}
               (peg.Cg (parser.till "</span>") :price)))))
       (parser.anywhere
         (parser.tag :p {:class "price"}
           (parser.tag :span {:id "*"}
             (peg.Cg (parser.till "</span>") :price)))))
     ;; id
     (parser.anywhere
       (parser.tag :input {:type "hidden"
                           :name "product_id"
                           :value (peg.Cg parser.pegs.number :id)}))
     ;; the "buy" button marks the end of the card
     (parser.anywhere
       (parser.tag :button {:type "*" :onclick "*" :class "*"} "Купить"))))

(fn normalize [product]
  "Convert the raw captures of `product-peg` into the common product table.

  `product.weight` is nil when the card has no selected weight option.
  NOTE(review): assumes `number.string->number` tolerates nil -- confirm.
  `:price-per` is price / weight rounded up to one decimal place, or nil when
  either value is missing or the weight is not positive."
  (local year (parser.guess-year product.title))
  (local weight (number.string->number product.weight))
  (local price (number.string->number product.price))
  {:site "artoftea"
   :id product.id
   :url product.url
   ;; The title is captured and used for the year guess, so expose it as the
   ;; other site normalizers do.
   :title product.title
   :description product.description
   :image product.image
   :year year
   :price price
   :weight weight
   ;; NOTE(review): presumably the fetcher copies :category from the path
   ;; entry onto the product (the ipuer normalizer reads it the same way) --
   ;; confirm. It is simply nil otherwise.
   :category product.category
   :price-per (if (and price weight (< 0 weight))
                (/ (math.ceil (* (/ price weight) 10)) 10)
                nil)})

(fn products []
  "Fetch and normalize the products of every configured category."
  (fetcher.from-html
    [{:path "redtea" :category "Красный чай"}
     {:path "greentea" :category "Зеленый чай"}]
    url-formatter
    product-peg
    normalize))

{: products}

;;; ==========================================================================
;;; parser/ipuer.fnl (new file, mode 100644)
;;; ==========================================================================

(import-macros {: map} :lib.macro)

;; Prefer the C implementation of LPeg; fall back to the LuaJIT port.
(local peg
  (if (pick-values 1 (pcall require :lpeg))
    (require :lpeg)
    (require :lpeglj)))
(local number (require :lib.number))
(local parser (require :parser.parser))
(local fetcher (require :fetcher))

(fn url-formatter [path page]
  "Build the catalogue URL for category `path` and page number `page`."
  (.. "https://ipuer.ru/catalog/" path "/?p=" page))

;; Matches one product card. Named group captures: id, url, image, title,
;; price.
(local product-peg
  (* ;; id
     (parser.anywhere
       (parser.tag :div
                   {:data-id (peg.Cg parser.pegs.number :id) :class "*"}))
     ;; url and image
     (parser.anywhere
       (parser.tag :a {:href (peg.Cg (parser.till "\"") :url)}
         (parser.tag :img {:src (peg.Cg (parser.till "\"") :image) :alt "*"})))
     ;; title
     (parser.anywhere
       (parser.tag :div {:class "card-product_title"}
         (parser.tag :a {:href "*"}
           (parser.tag :span {} (peg.Cg (parser.till "</span>") :title)))))
     ;; price: digits, optionally followed by space-separated digit groups
     (parser.anywhere
       (parser.tag :span {:class "card-price"}
         (* (peg.Cg
              (* parser.pegs.number
                 (parser.maybe (* " " parser.pegs.number)))
              :price)
            " р.")))
     ;; the "add to cart" link marks the end of the card
     (parser.anywhere
       (+ (parser.tag :a {:data-url "*" :class "*" :data-add-text "*"} "В корзину")
          (parser.tag :a {:data-url "*" :class "*"} "В корзину")))))

(fn normalize [product]
  "Convert the raw captures of `product-peg` into the common product table.

  The captured url and image are prefixed with the site origin. Weight and
  year are guessed from the title.
  NOTE(review): the captured price may contain spaces (\"1 200\"); presumably
  `number.string->number` strips them -- confirm.
  `:price-per` is price / weight rounded up to one decimal place, or nil when
  either value is missing or the weight is not positive."
  (local weight (parser.guess-weight product.title))
  (local price (number.string->number product.price))
  {:site "ipuer"
   :id product.id
   :url (.. "https://ipuer.ru" product.url)
   :title product.title
   :description ""
   ;; FIXME: parse all editions into different products
   :image (.. "https://ipuer.ru" product.image)
   :year (parser.guess-year product.title)
   :price price
   :weight weight
   :category product.category
   :price-per (if (and price weight (< 0 weight))
                (/ (math.ceil (* (/ price weight) 10)) 10)
                nil)})

(fn products []
  "Fetch and normalize the products of every configured category."
  (fetcher.from-html
    [{:path "shen-puer" :category "Шен пуэр"}
     {:path "shu-puer" :category "Шу пуэр"}
     {:path "drugoy-chay"}
     {:path "blagovoniya" :category "Благовония"}
     {:path "posuda" :category "Посуда"}
     {:path "282" :category "Посуда"}]
    url-formatter
    product-peg
    normalize))

{: products}

;;; ==========================================================================
;;; parser/ozchai.fnl (new file, mode 100644)
;;; ==========================================================================

(import-macros {: map} :lib.macro)

(local http (require :lib.http))
(local array (require :lib.array))
(local json (require :vendor.json))

(local %all-products-partuid 176163172341)

(fn string->number [str]
  "Strip everything except digits and dots from `str` and convert the rest
  with `tonumber`. Returns nil when `str` is nil or nothing numeric is left."
  (if str
    ;; gsub returns (string, count); keep only the string so that the count
    ;; is not passed to tonumber as a base.
    (tonumber (pick-values 1 (str:gsub "[^0-9.]" "")))
    nil))

(fn request [partuid slice]
  "Request slice number `slice` of the store part `partuid` from the Tilda
  store API and return the decoded JSON body.

  NOTE(review): `luna` is not defined in this file; presumably it is a global
  provided by the host runtime -- confirm."
  ;; Build the URL once so that the logged URL is exactly the requested one.
  (let [url (string.format
              (.. "https://store.tildaapi.com/api/getproductslist/"
                  "?storepartuid="
                  partuid
                  "&recid=280779251&c=1723216515077"
                  "&getparts=true&getoptions=true&slice=%d&size=36")
              slice)]
    (print url)
    (let [(status headers body)
          (luna.http.request
            "GET"
            url
            {:Content-Type "application/json"
             :User-Agent (http.random-user-agent)}
            "")]
      (json.decode body))))

(fn walk-slices [partuid]
  "Request consecutive slices, starting from 1, until one comes back without
  products. Returns all products gathered so far as one array."
  (fn gather [slice knil]
    (let [{: products} (request partuid slice)]
      ;; A response without a `products` key ends the walk as well.
      (if (or (= nil products) (= 0 (# products)))
        knil
        (do
          ;; pause between requests
          (os.execute "sleep 1")
          (gather (+ slice 1) (array.concat knil products))))))
  (gather 1 []))

(fn normalize [_ product]
  "Convert one API product record into the common product table.

  Weight and price are read from the first edition only; `product.gallery` is
  a JSON-encoded string. The first argument is ignored.
  `:price-per` is price / weight rounded up to one decimal place, or nil when
  either value is missing or the weight is not positive."
  (local gallery (json.decode product.gallery))
  (local edition (. product.editions 1))
  (local weight (string->number (. edition :Вес)))
  (local price (string->number (. edition :price)))
  {:site "ozchai"
   :id product.url
   :url product.url
   :title product.title
   :description product.descr
   ;; FIXME: parse all editions into different products
   :image (if (< 0 (# gallery))
            (. (. gallery 1) :img)
            "")
   :weight weight
   :price price
   :price-per (if (and price weight (< 0 weight))
                (/ (math.ceil (* (/ price weight) 10)) 10)
                nil)
   :characteristics product.characteristics})

(fn products []
  "Fetch every product of the store and normalize each record."
  (map normalize (walk-slices %all-products-partuid)))

{: products}

;;; ==========================================================================
;;; parser/parser.fnl (new file, mode 100644)
;;; ==========================================================================

(import-macros {: map} :lib.macro)

(local number (require :lib.number))

;; Prefer the C implementation of LPeg; fall back to the LuaJIT port.
(local peg
  (if (pick-values 1 (pcall require :lpeg))
    (require :lpeg)
    (require :lpeglj)))

;; "not" is taken >:(
(fn pnot [p]
  "Match any single character at a position where `p` does not match."
  (- (peg.P 1) (peg.P p)))

(fn till [p]
  "Match one or more characters, stopping right before `p` matches."
  (^ (pnot p) 1))

(fn maybe [p]
  "Match zero or more repetitions of `p`."
  (^ (peg.P p) 0))

(fn anywhere [p]
  "Match `p` at the first position, at or after the current one, where it
  succeeds; everything before it is skipped."
  (peg.P [(+ p (* 1 (peg.V 1)))]))

;; Building blocks for a small HTML grammar.
(local pegs {})
(tset pegs :number (^ (peg.R "09") 1))
(tset pegs :letters (^ (+ (peg.R "az") (peg.R "AZ")) 1))
;; \r is included so that documents with CRLF line endings parse as well.
(tset pegs :space (peg.S "\r\n\t "))
(tset pegs :spaces (^ (peg.S "\r\n\t ") 1))
;; Letters and digits may be mixed so that names such as "h1" match; this is
;; a superset of the previous "letters only or digits only" rule.
(tset pegs :tag-name (^ (+ (peg.R "az") (peg.R "AZ") (peg.R "09")) 1))
(tset pegs :attr
      (peg.Ct (* (peg.Cg (^ (+ pegs.letters "-") 1) :name)
                 (maybe (* "=\"" (peg.Cg (till "\"") :value) "\"")))))
(tset pegs :self-closing-tag
      (* "<"
         (peg.Cg
           (+ (peg.P "area") "base" "br" "col" "embed" "hr"
              "img" "input" "link" "meta" "param" "source"
              "track" "wbr") ;; should be case insensitive
           :tag)
         (peg.Cg (peg.Ct (^ (+ pegs.space (peg.Cg pegs.attr)) 0)) :attrs)
         (maybe "/") ">"))
(tset pegs :opening-tag
      (* "<" (peg.Cg pegs.tag-name :tag)
         (peg.Cg (peg.Ct (^ (+ pegs.space (peg.Cg pegs.attr)) 0)) :attrs)
         ">"))
(tset pegs :closing-tag (* "</" pegs.tag-name ">"))
(tset pegs :doctype
      (* "<!DOCTYPE HTML" (^ pegs.attr 0) ">")) ;; should be case insensitive
;; Recursive rule: a tag is either self-closing or an opening tag, a list of
;; child nodes (nested tags or text) and a closing tag. The closing tag name
;; is not checked against the opening one.
(tset pegs :tag
      (peg.P [(peg.Ct (+ pegs.self-closing-tag
                         (* pegs.opening-tag
                            (peg.Cg
                              (peg.Ct
                                (^ (+ (+ pegs.space (peg.V 1))
                                      (peg.Cg (till pegs.closing-tag)))
                                   0))
                              :nodes)
                            pegs.closing-tag)))]))
(tset pegs :html
      (* pegs.doctype (peg.Ct (^ (+ pegs.space (peg.Cg pegs.tag)) 0))))

(fn tag [tag attrs contents]
  "Build a pattern matching the HTML element `tag`.

  `attrs` maps attribute names to the expected value:
    * a string or pattern -- the value must match it,
    * \"*\" -- any value, including an empty one,
    * \"\" -- a boolean attribute, written either bare or as name=\"name\".
  Attributes may appear in any order.

  `contents` is a pattern for the element body; \"*\" skips everything up to
  the closing tag. When `contents` is nil only the opening tag is matched and
  a trailing \"/\" is allowed."
  (local tag-peg (peg.P tag))
  (local attrs-count (accumulate [sum 0 _ _ (pairs attrs)] (+ 1 sum)))
  (fn attr-peg [name value]
    (* (peg.P name)
       (if (not= value "")
         (* "=\""
            ;; wildcard for any value (an empty value is a value too)
            (if (= value "*")
              (^ (pnot "\"") 0)
              (peg.P value))
            "\"")
         ;; boolean attribute: optional ="<name>" suffix
         (maybe (* "=\"" (peg.P name) "\"")))))
  ;; Ordered choice between every expected attribute and a run of spaces.
  (local attrs-peg
    (accumulate [choice pegs.spaces
                 name value (pairs attrs)]
      (+ (attr-peg name value) choice)))
  ;; Everything of the opening tag up to, but excluding, the final ">".
  ;; n attributes separated by spaces are at least 2n - 1 items; for n = 0
  ;; the exponent is -1, i.e. at most one item.
  (local opening
    (* (^ pegs.space 0)
       "<" tag-peg (^ pegs.space 0)
       (^ attrs-peg (- (* attrs-count 2) 1))
       (^ pegs.space 0)))
  (if contents
    (peg.P (* opening ">"
              ;; tag contents
              (^ pegs.space 0)
              (if (= contents "*")
                (till (* "</" tag-peg ">"))
                contents)
              (^ pegs.space 0)
              ;; closing tag
              (* "</" tag-peg ">")))
    (peg.P (* opening (maybe "/") ">"))))

(fn match-many [html tag]
  "Match one or more consecutive occurrences of the pattern `tag` at the start
  of `html`. Returns a list with one capture table per occurrence, or nil when
  there is no match."
  (: (peg.Ct (^ (peg.Ct tag) 1))
     :match html))

(fn guess-category [title]
  "Guess the tea category from keywords in `title`. Only the two spellings
  listed for each keyword are recognised."
  (if (: (anywhere (+ (peg.P "зеленый") "Зеленый")) :match title)
    "Зеленый чай"
    (: (anywhere (+ (peg.P "Улун") "улун")) :match title)
    "Улун"
    (: (anywhere (+ (peg.P "Белый") "белый")) :match title)
    "Белый чай"
    (: (anywhere (+ (peg.P "Желтый") "желтый")) :match title)
    "Желтый чай"
    (: (anywhere (+ (peg.P "Красный") "красный")) :match title)
    "Красный чай"
    "Неизвестная категория"))

(fn guess-year [title]
  "Guess the year from `title`: four or more digits followed by \"г\" (but not
  by \"гр\", which denotes grams).
  NOTE(review): the match is nil when nothing is found; assumes
  `number.string->number` tolerates nil -- confirm."
  (number.string->number
    (: (anywhere
         (* (peg.C (^ (peg.R "09") 4))
            (maybe " ")
            (- "г" (peg.P "гр"))))
       :match title)))

(fn guess-weight [title]
  "Guess the weight in grams from `title`: digits followed by \"гр\".
  NOTE(review): the match is nil when nothing is found; assumes
  `number.string->number` tolerates nil -- confirm."
  (number.string->number
    (: (anywhere
         (* (peg.C pegs.number) (maybe " ") "гр"))
       :match title)))

{: match-many
 : tag
 : anywhere
 : till
 : maybe
 : pegs
 :not pnot
 : guess-category
 : guess-year
 : guess-weight}
