summaryrefslogtreecommitdiff
path: root/parser
diff options
context:
space:
mode:
Diffstat (limited to 'parser')
-rw-r--r--parser/artoftea.fnl71
-rw-r--r--parser/ipuer.fnl70
-rw-r--r--parser/ozchai.fnl69
-rw-r--r--parser/parser.fnl143
4 files changed, 353 insertions, 0 deletions
diff --git a/parser/artoftea.fnl b/parser/artoftea.fnl
new file mode 100644
index 0000000..1f03ed1
--- /dev/null
+++ b/parser/artoftea.fnl
@@ -0,0 +1,71 @@
+(import-macros {: map} :lib.macro)
+
+(local peg
+ (if (pick-values 1 (pcall require :lpeg))
+ (require :lpeg)
+ (require :lpeglj)))
+(local parser (require :parser.parser))
+(local number (require :lib.number))
+(local fetcher (require :fetcher))
+
+(fn url-formatter [path page]
+  "Build the artoftea.ru listing URL for catalogue section `path`, page `page`."
+  (let [base "https://artoftea.ru/"
+        page-query "/?page="]
+    (.. base path page-query page)))
+
+;; Pattern for one product card on an artoftea.ru listing page.
+;; Named group captures: url, image, title, description, weight (only when a
+;; selected weight option is present), price and id.
+(local product-peg
+  (let [;; link to the product page wrapping its thumbnail
+        front-image
+        (parser.tag :div {:class "front-image"}
+          (parser.tag :a {:href (peg.Cg (parser.till "\"") :url)}
+            (parser.tag :img {:src (peg.Cg (parser.till "\"") :image)
+                              :title "*" :class "*" :alt "*"})))
+        ;; product name link
+        name-link
+        (parser.tag :div {:class "name"}
+          (parser.tag :a {:href "*"} (peg.Cg (parser.till "</a>") :title)))
+        ;; short description paragraph
+        description
+        (parser.tag :p {:class "description"}
+          (peg.Cg (parser.till "</p>") :description))
+        ;; currently selected weight option, e.g. "100 гр"
+        selected-weight
+        (parser.tag :option {:value "*" :selected "selected"}
+          (* (peg.Cg parser.pegs.number :weight) " гр" parser.pegs.spaces))
+        ;; price text inside <p class="price"><span id=...>
+        price
+        (parser.tag :p {:class "price"}
+          (parser.tag :span {:id "*"}
+            (peg.Cg (parser.till "</span>") :price)))
+        ;; hidden form field carrying the numeric product id
+        product-id
+        (parser.tag :input {:type "hidden"
+                            :name "product_id"
+                            :value (peg.Cg parser.pegs.number :id)})
+        ;; the "buy" button closes the card
+        buy-button
+        (parser.tag :button {:type "*" :onclick "*" :class "*"} "Купить")]
+    (* (parser.anywhere front-image)
+       (parser.anywhere name-link)
+       (parser.anywhere description)
+       ;; weight followed by price when a weight option exists, otherwise
+       ;; just the price
+       (+ (* (parser.anywhere selected-weight)
+             (parser.anywhere price))
+          (parser.anywhere price))
+       (parser.anywhere product-id)
+       (parser.anywhere buy-button))))
+
+(fn normalize [product]
+  "Turn the raw captures of one artoftea product card into a product record.
+
+`product` holds the string captures produced by `product-peg`; `weight` may be
+absent when the card has no selected weight option. `price-per` is the price
+divided by the weight, rounded up to one decimal place, or nil when either
+value is missing or the weight is not positive."
+  (let [year (parser.guess-year product.title)
+        weight (number.string->number product.weight)
+        price (number.string->number product.price)]
+    {:site "artoftea"
+     :id product.id
+     :url product.url
+     ;; the title is captured and used for the year guess; return it as well,
+     ;; like the other site normalizers do
+     :title product.title
+     :description product.description
+     :image product.image
+     :year year
+     :price price
+     :weight weight
+     ;; NOTE(review): assumes the fetcher copies the path's :category onto the
+     ;; product (as the ipuer normalizer expects); stays nil otherwise
+     :category product.category
+     :price-per (if (and price weight (< 0 weight))
+                    (/ (math.ceil (* (/ price weight) 10)) 10)
+                    nil)}))
+
+(fn products []
+  "Pass the artoftea catalogue sections, URL builder, product pattern and
+normalizer to `fetcher.from-html` and return its result."
+  (let [sections [{:path "redtea" :category "Красный чай"}
+                  {:path "greentea" :category "Зеленый чай"}]]
+    (fetcher.from-html sections url-formatter product-peg normalize)))
+
+{: products}
diff --git a/parser/ipuer.fnl b/parser/ipuer.fnl
new file mode 100644
index 0000000..7fefd1b
--- /dev/null
+++ b/parser/ipuer.fnl
@@ -0,0 +1,70 @@
+(import-macros {: map} :lib.macro)
+
+(local peg
+ (if (pick-values 1 (pcall require :lpeg))
+ (require :lpeg)
+ (require :lpeglj)))
+(local number (require :lib.number))
+(local parser (require :parser.parser))
+(local fetcher (require :fetcher))
+
+(fn url-formatter [path page]
+  "Build the ipuer.ru catalogue URL for section `path`, page `page`."
+  (let [base "https://ipuer.ru/catalog/"
+        page-query "/?p="]
+    (.. base path page-query page)))
+
+;; Pattern for one product card on an ipuer.ru catalogue page. The parts are
+;; matched in sequence, each one searched for from where the previous part
+;; ended. Named group captures: id, url, image, title, price.
+(local product-peg
+ (* ;; id
+ (parser.anywhere
+ (parser.tag :div
+ {:data-id (peg.Cg parser.pegs.number :id) :class "*"}))
+ ;; url and image
+ (parser.anywhere
+ (parser.tag :a {:href (peg.Cg (parser.till "\"") :url)}
+ (parser.tag :img {:src (peg.Cg (parser.till "\"") :image) :alt "*"})))
+ ;; title
+ (parser.anywhere
+ (parser.tag :div {:class "card-product_title"}
+ (parser.tag :a {:href "*"}
+ (parser.tag :span {} (peg.Cg (parser.till "</span>") :title)))))
+ ;; price
+ ;; digits, optionally followed by space-separated digit groups
+ ;; (e.g. "1 200"), then the " р." suffix which is not captured
+ (parser.anywhere
+ (parser.tag :span {:class "card-price"}
+ (* (peg.Cg
+ (* parser.pegs.number
+ (parser.maybe (* " " parser.pegs.number)))
+ :price)
+ " р.")))
+ ;; add-to-cart link ends the card; it is accepted with or without the
+ ;; data-add-text attribute
+ (parser.anywhere
+ (+ (parser.tag :a {:data-url "*" :class "*" :data-add-text "*"} "В корзину")
+ (parser.tag :a {:data-url "*" :class "*"} "В корзину")))))
+
+(fn normalize [product]
+  "Turn the raw captures of one ipuer product card into a product record.
+
+Site-relative `url` and `image` are prefixed with the site origin. Weight and
+year are guessed from the title. `price-per` is the price divided by the
+weight, rounded up to one decimal place, or nil when either value is missing
+or the weight is not positive."
+  (let [weight (parser.guess-weight product.title)
+        price (number.string->number product.price)]
+    {:site "ipuer"
+     :id product.id
+     :url (.. "https://ipuer.ru" product.url)
+     :title product.title
+     :description ""
+     ;; FIXME (kept from the original): parse all editions into different
+     ;; products
+     :image (.. "https://ipuer.ru" product.image)
+     :year (parser.guess-year product.title)
+     :price price
+     :weight weight
+     ;; Not every catalogue path declares a category ("drugoy-chay" does
+     ;; not); fall back to guessing it from the title instead of leaving nil.
+     ;; NOTE(review): assumes the fetcher copies the path's :category onto
+     ;; the product before calling this function.
+     :category (or product.category
+                   (parser.guess-category product.title))
+     :price-per (if (and price weight (< 0 weight))
+                    (/ (math.ceil (* (/ price weight) 10)) 10)
+                    nil)}))
+
+(fn products []
+  "Pass the ipuer catalogue sections, URL builder, product pattern and
+normalizer to `fetcher.from-html` and return its result."
+  (let [sections [{:path "shen-puer" :category "Шен пуэр"}
+                  {:path "shu-puer" :category "Шу пуэр"}
+                  ;; this section declares no category
+                  {:path "drugoy-chay"}
+                  {:path "blagovoniya" :category "Благовония"}
+                  {:path "posuda" :category "Посуда"}
+                  {:path "282" :category "Посуда"}]]
+    (fetcher.from-html sections url-formatter product-peg normalize)))
+
+{: products}
diff --git a/parser/ozchai.fnl b/parser/ozchai.fnl
new file mode 100644
index 0000000..6bf6286
--- /dev/null
+++ b/parser/ozchai.fnl
@@ -0,0 +1,69 @@
+(import-macros {: map} :lib.macro)
+
+(local http (require :lib.http))
+(local array (require :lib.array))
+(local json (require :vendor.json))
+
+;; Store part UID sent as the `storepartuid` query parameter by `request`.
+;; NOTE(review): by its name this is presumably the part listing every
+;; product of the shop; confirm against the site.
+(local %all-products-partuid 176163172341)
+
+(fn string->number [str]
+  "Strip every character except ASCII digits and `.` from `str` and convert
+the remainder with `tonumber`. Returns nil for a nil/false `str` or when
+nothing numeric is left."
+  (when str
+    (let [(cleaned _replacements) (str:gsub "[^0-9.]" "")]
+      (tonumber cleaned))))
+
+(fn request [partuid slice]
+  "GET one page (`slice`) of the product list for store part `partuid` from
+the Tilda store API and return the JSON-decoded response body.
+
+The URL is built once and the exact URL being requested is printed; the
+earlier version printed the template with a literal `%d` instead of the slice
+number."
+  (let [url (string.format
+             (.. "https://store.tildaapi.com/api/getproductslist/"
+                 "?storepartuid="
+                 partuid
+                 "&recid=280779251&c=1723216515077"
+                 "&getparts=true&getoptions=true&slice=%d&size=36")
+             slice)]
+    (print url)
+    ;; NOTE(review): `luna` is not required in this file; it is presumably a
+    ;; global provided by the runtime. Status and headers are not inspected.
+    (let [(_status _headers body)
+          (luna.http.request
+           "GET"
+           url
+           {:Content-Type "application/json"
+            :User-Agent (http.random-user-agent)}
+           "")]
+      (json.decode body))))
+
+(fn walk-slices [partuid]
+  "Request slices 1, 2, 3, ... of store part `partuid` until a slice comes
+back with no products, and return the products gathered so far. Sleeps one
+second between requests."
+  (fn collect-from [slice acc]
+    (let [response (request partuid slice)
+          batch response.products
+          merged (array.concat acc batch)]
+      (if (< 0 (length batch))
+          (do
+            ;; pause before asking for the next slice
+            (os.execute "sleep 1")
+            (collect-from (+ slice 1) merged))
+          acc)))
+  (collect-from 1 []))
+
+(fn normalize [_ product]
+  "Turn one product entry of the store API response into a product record.
+
+The first argument is ignored (presumably the key/index supplied by the `map`
+macro). Weight and price come from the first edition only. `price-per` is the
+price divided by the weight, rounded up to one decimal place, or nil when
+either value is missing or the weight is not positive."
+  (let [pictures (json.decode product.gallery)
+        ;; FIXME: parse all editions into different products
+        first-edition (. product.editions 1)
+        weight (string->number (. first-edition :Вес))
+        price (string->number (. first-edition :price))]
+    {:site "ozchai"
+     :id product.url
+     :url product.url
+     :title product.title
+     :description product.descr
+     ;; first gallery picture, or an empty string when there is none
+     :image (if (= 0 (length pictures))
+                ""
+                (. pictures 1 :img))
+     :weight weight
+     :price price
+     :price-per (when (and price weight (< 0 weight))
+                  (/ (math.ceil (* (/ price weight) 10)) 10))
+     :characteristics product.characteristics}))
+
+(fn products []
+  "Collect every slice of the all-products store part and run each entry
+through `normalize`."
+  (let [raw-products (walk-slices %all-products-partuid)]
+    (map normalize raw-products)))
+
+{: products}
diff --git a/parser/parser.fnl b/parser/parser.fnl
new file mode 100644
index 0000000..b52f881
--- /dev/null
+++ b/parser/parser.fnl
@@ -0,0 +1,143 @@
+(import-macros {: map} :lib.macro)
+
+(local number (require :lib.number))
+
+(local peg
+ (if (pick-values 1 (pcall require :lpeg))
+ (require :lpeg)
+ (require :lpeglj)))
+
+;; "not" is taken >:(
+(fn pnot [p]
+  "Pattern matching any single character at a position where `p` does not
+match."
+  (let [any-char (peg.P 1)
+        excluded (peg.P p)]
+    (- any-char excluded)))
+
+(fn till [p]
+  "One or more characters, stopping right before the first position where `p`
+matches. Fails when `p` matches immediately."
+  (let [other-char (pnot p)]
+    (^ other-char 1)))
+
+(fn maybe [p]
+  "Zero or more repetitions of `p`; succeeds even when `p` is absent."
+  (let [pattern (peg.P p)]
+    (^ pattern 0)))
+
+(fn anywhere [p]
+  "Grammar that tries `p` at the current position and, when that fails, skips
+one character and tries again, i.e. finds the first place where `p` matches."
+  (let [skip-and-retry (* (peg.P 1) (peg.V 1))]
+    (peg.P [(+ p skip-and-retry)])))
+
+;; Primitive patterns shared by the HTML grammar and the site parsers.
+(local pegs {})
+;; one or more ASCII digits
+(tset pegs :number (^ (peg.R "09") 1))
+;; one or more ASCII letters
+(tset pegs :letters (^ (+ (peg.R "az") (peg.R "AZ")) 1))
+;; a single whitespace character / a run of them
+(tset pegs :space (peg.S "\n\t "))
+(tset pegs :spaces (^ (peg.S "\n\t ") 1))
+;; Letters followed by any mix of letters and digits. The previous
+;; "letters OR digits" choice stopped after the letters, so names such as
+;; "h1" were never matched in full.
+(tset pegs :tag-name
+      (* pegs.letters (^ (+ pegs.letters pegs.number) 0)))
+;; name, optionally followed by ="value"; captured as a table with `name`
+;; and (when present) `value`
+(tset pegs :attr
+      (peg.Ct (* (peg.Cg (^ (+ pegs.letters "-") 1) :name)
+                 (maybe (* "=\"" (peg.Cg (till "\"") :value) "\"")))))
+;; "<" + one of the listed void element names + any mix of whitespace and
+;; attributes + optional "/" + ">". The name is captured as `tag`, the
+;; attribute tables as `attrs`. Names are literal prefixes tried in order, so
+;; e.g. "col" also matches the start of "colgroup".
+(tset pegs :self-closing-tag
+ (* "<"
+ (peg.Cg
+ (+ (peg.P "area") "base" "br" "col" "embed" "hr"
+ "img" "input" "link" "meta" "param" "source"
+ "track" "wbr") ;; should be case insensitive
+ :tag)
+ (peg.Cg (peg.Ct (^ (+ pegs.space (peg.Cg pegs.attr)) 0)) :attrs)
+ (maybe "/") ">"))
+;; "<name attrs>" with the same `tag` / `attrs` captures as above
+(tset pegs :opening-tag
+ (* "<" (peg.Cg pegs.tag-name :tag)
+ (peg.Cg (peg.Ct (^ (+ pegs.space (peg.Cg pegs.attr)) 0)) :attrs)
+ ">"))
+;; "</name>"; the name is not checked against the opening tag
+(tset pegs :closing-tag (* "</" pegs.tag-name ">"))
+;; literal "<!DOCTYPE HTML", then attribute-like tokens, then ">"
+(tset pegs :doctype
+ (* "<!DOCTYPE HTML" (^ pegs.attr 0) ">")) ;; should be case insensitive
+;; Recursive element grammar: either a self-closing tag, or an opening tag,
+;; a `nodes` list and a closing tag. Each node is whitespace, a nested
+;; element (rule 1), or the text up to the next closing tag.
+(tset pegs :tag
+ (peg.P [(peg.Ct (+ pegs.self-closing-tag
+ (* pegs.opening-tag
+ (peg.Cg
+ (peg.Ct
+ (^ (+ (+ pegs.space (peg.V 1))
+ (peg.Cg (till pegs.closing-tag)))
+ 0))
+ :nodes)
+ pegs.closing-tag)))]))
+;; doctype followed by a table of top-level elements separated by whitespace
+(tset pegs :html
+ (* pegs.doctype (peg.Ct (^ (+ pegs.space (peg.Cg pegs.tag)) 0))))
+
+(fn tag [tag attrs contents]
+  "Build a pattern for one HTML element named `tag`.
+
+`attrs` maps attribute names to the expected value: a string or pattern to
+match between the quotes, \"*\" for any value (including an empty one), or
+\"\" for a boolean attribute written either bare (`selected`) or as
+`selected=\"selected\"`. Only the listed attributes are accepted, in any
+order.
+
+With `contents` the pattern covers opening tag, contents and closing tag;
+`contents` is a string/pattern, or \"*\" for anything up to the closing tag.
+Without `contents` only the opening tag is matched and a trailing `/` is
+allowed. Leading whitespace is skipped in both cases."
+  (local tag-peg (peg.P tag))
+  (local attrs-count (accumulate [sum 0 _ _ (pairs attrs)] (+ 1 sum)))
+  (fn attr-peg [name value]
+    ;; the attribute name is matched exactly once
+    (* (peg.P name)
+       (if (~= value "")
+           (* "=\""
+              ;; wildcard for any value, the empty string included
+              (if (= value "*")
+                  (^ (pnot "\"") 0)
+                  (peg.P value))
+              "\"")
+           ;; boolean attribute: optional ="<name>". The quotes used to be
+           ;; inside a single string literal, producing `=" name "`.
+           (maybe (.. "=\"" name "\"")))))
+  ;; ordered choice over every expected attribute, or a run of whitespace
+  (local attrs-peg
+    (accumulate [alternatives pegs.spaces
+                 name value (pairs attrs)]
+      (+ (attr-peg name value) alternatives)))
+  ;; n attributes separated by n-1 whitespace runs need at least 2n-1
+  ;; repetitions; for n = 0 the exponent is -1, i.e. at most one repetition
+  (local opening-start
+    (* (^ pegs.space 0)
+       "<" tag-peg (^ pegs.space 0)
+       (^ attrs-peg (- (* attrs-count 2) 1))
+       (^ pegs.space 0)))
+  (if contents
+      (peg.P (* opening-start ">"
+                ;; tag contents
+                (^ pegs.space 0)
+                (if (= contents "*")
+                    (till (* "</" tag-peg ">"))
+                    contents)
+                (^ pegs.space 0)
+                ;; closing tag
+                "</" tag-peg ">"))
+      (peg.P (* opening-start (maybe "/") ">"))))
+
+(fn match-many [html tag]
+  "Match one or more consecutive occurrences of the pattern `tag` at the start
+of `html`. Returns a table with one capture table per occurrence, or nil when
+there is no match."
+  (let [one (peg.Ct tag)
+        many (peg.Ct (^ one 1))]
+    (many:match html)))
+
+(fn guess-category [title]
+  "Guess a tea category from keywords in `title`. Each keyword is tried in a
+lower-case and a capitalized spelling; the first category whose keyword occurs
+wins, otherwise \"Неизвестная категория\" is returned."
+  (fn mentions? [spelling other-spelling]
+    (: (anywhere (+ (peg.P spelling) other-spelling)) :match title))
+  (if (mentions? "зеленый" "Зеленый") "Зеленый чай"
+      (mentions? "Улун" "улун") "Улун"
+      (mentions? "Белый" "белый") "Белый чай"
+      (mentions? "Желтый" "желтый") "Желтый чай"
+      (mentions? "Красный" "красный") "Красный чай"
+      "Неизвестная категория"))
+
+(fn guess-year [title]
+  "Find the first run of four or more digits in `title` that is followed
+(after optional spaces) by \"г\" but not by \"гр\", and pass it to
+`number.string->number`. When nothing matches, nil is passed instead."
+  (let [digits (peg.C (^ (peg.R "09") 4))
+        ;; "г" marks a year, "гр" would be a weight in grams
+        year-marker (- "г" (peg.P "гр"))
+        finder (anywhere (* digits (maybe " ") year-marker))]
+    (number.string->number (finder:match title))))
+
+(fn guess-weight [title]
+  "Find the first run of digits in `title` that is followed (after optional
+spaces) by \"гр\" and pass it to `number.string->number`. When nothing
+matches, nil is passed instead."
+  (let [grams (peg.C pegs.number)
+        finder (anywhere (* grams (maybe " ") "гр"))]
+    (number.string->number (finder:match title))))
+
+;; Module exports: pattern builders, the shared `pegs` table and the
+;; title-based guessers.
+{: match-many
+ : tag
+ : anywhere
+ : till
+ : maybe
+ : pegs
+ ;; `pnot` is exported under the key "not"
+ :not pnot
+ : guess-category
+ : guess-year
+ : guess-weight}