Subversion Repositories MeX

Compare Revisions

Ignore whitespace Rev 2 → Rev 3

/trunk/mex/core/mod_env.py
9,7 → 9,8
 
import sys
import os
from os.path import abspath, sep as SEP
from os.path import abspath
from os.path import sep as SEP
 
# Parent of the directory that contains this module: abspath() collapses
# "<module file>/../../" lexically.
ROOT = abspath(sys.modules[__name__].__file__ + "/../../")
# Home directory of the current user, expanded from '~'.
USER_DIR = os.path.expanduser('~')
/trunk/mex/core/mod_rdf.py
0,0 → 1,130
#!/usr/bin/python
# -*- coding: utf-8 -*-
# File: mod_rdf.py
# Author: Tomás Vírseda
# License: GPL v3
# Description: Namespaces (initially borrowed from SWAML project)
 
"""Common namespaces"""
 
from rdflib import URIRef, Literal
from rdflib import Namespace
from rdflib import ConjunctiveGraph
 
# Namespace manager of a throw-away graph, read through the public
# ``namespace_manager`` property rather than the private getter, which is
# an rdflib implementation detail.
namespace_manager = ConjunctiveGraph().namespace_manager
 
# MeX URIRef Base
# Prefix used when building subject URIs for scanned files.
BASE = URIRef("t00mlabs://")

# Every constant below is an rdflib Namespace; terms are obtained from it
# with item or attribute access (e.g. NFO['fileUrl'], RDF.type).

# W3C Ontologies
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
OWL = Namespace("http://www.w3.org/2002/07/owl#")
SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')

# Dublin Core
DC = Namespace(u'http://purl.org/dc/elements/1.1/') # Dublin Core
DCTERMS = Namespace(u'http://purl.org/dc/terms/') # Dublin Core Terms

# NEPOMUK Ontologies
PIMO = Namespace(u'http://www.semanticdesktop.org/ontologies/pimo/') # NEPOMUK Personal Information Model
NAO = Namespace(u'http://www.semanticdesktop.org/ontologies/nao/') # NEPOMUK Annotation Ontology
NIE = Namespace(u'http://www.semanticdesktop.org/ontologies/nie/') # NEPOMUK Information Element Ontology
NCO = Namespace(u'http://www.semanticdesktop.org/ontologies/nco/') # NEPOMUK Contact Ontology
NFO = Namespace(u'http://www.semanticdesktop.org/ontologies/nfo/') # NEPOMUK File Ontology
NID3 = Namespace(u'http://www.semanticdesktop.org/ontologies/nid3/') # NEPOMUK ID3 Ontology
NEXIF = Namespace(u'http://www.semanticdesktop.org/ontologies/nexif/') # NEPOMUK Exif Ontology
TMO = Namespace(u'http://www.semanticdesktop.org/ontologies/2008/05/20/tmo#') # NEPOMUK Task Model Ontology
NMO = Namespace(u'http://www.semanticdesktop.org/ontologies/nmo/') # NEPOMUK Message Ontology

# Description Of A Project
DOAP = Namespace(u'http://usefulinc.com/ns/doap#') # DOAP

# Others
FOAF = Namespace(u'http://xmlns.com/foaf/0.1/') # Friend Of A Friend
SIOC = Namespace(u"http://rdfs.org/sioc/ns#")
SIOCT = Namespace(u"http://rdfs.org/sioc/types#")
GEO = Namespace(u"http://www.w3.org/2003/01/geo/wgs84_pos#") # WGS84 positions
MVCB = Namespace(u"http://webns.net/mvcb/")
ICAL = Namespace(u"http://www.w3.org/2002/12/cal/icaltzd#")
XSD = Namespace(u"http://www.w3.org/2001/XMLSchema#") # XML Schema

# OpenXML extended properties
AP = Namespace(u"https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.extendedproperties.aspx#")
 
 
# Prefix -> namespace table. The application service binds every pair on
# its output graph before adding triples.
NSBINDINGS = {
    u"rdf" : RDF,
    u"rdfs" : RDFS,
    u"pimo" : PIMO,
    u"nao" : NAO,
    u"nco" : NCO,
    u"nfo" : NFO,
    u"nmo" : NMO,
    u"nie" : NIE,
    u"nid3" : NID3,
    u"nexif" : NEXIF,
    u"tmo" : TMO,
    u"dc" : DC,
    u"dct" : DCTERMS,
    u"foaf" : FOAF,
    u"sioc" : SIOC,
    u"sioct" : SIOCT,
    u"geo" : GEO,
    u"mvcb" : MVCB,
    u"ical" : ICAL,
    u"xsd" : XSD,
    u"owl" : OWL,
    u"skos" : SKOS,
    u"doap" : DOAP,
    u"ap" : AP,
}

# Reverse table keyed by namespace object; each value is the prefix string
# wrapped in a Namespace.
NSBINDINGSINV = {}
for ns in NSBINDINGS:
    NSBINDINGSINV[NSBINDINGS[ns]] = Namespace(ns)
 
 
# Maps a libextractor keyword type to a 4-tuple:
#
#   EXTRACTOR_KEYWORD['LIBEXTRACTOR KEYWORD-TYPE'] = (
#       'DC-ELEMENT',
#       'DC-TERM',
#       'NEPOMUK-ONTOLOGY',        # left part of the predicate
#       'NEPOMUK-CLASS/PROPERTY',  # right part of the predicate
#   )
#
# Some positions are left as empty strings.
EXTRACTOR_KEYWORD = {
    'album': ('title', 'collection', 'nid3', 'albumTitle'),
    'artist': ('creator', '', 'nao', 'creator'),
    'book title': ('title', '', 'nie', 'title'),
    'author': ('creator', '', 'nao', 'creator'),
    'content type': ('xsd', 'string', 'nid3', 'contentType'),
    'creator': ('creator', '', 'nao', 'creator'),
    'creation date': ('date', '', 'nie', 'created'),
    'date': ('date', '', 'nie', 'contentLastModified'),
    # Was mapped to nao:creator, a copy-paste slip from the creator rows.
    'description': ('description', '', 'nao', 'description'),
    'generator': ('format', 'Software', 'nie', 'generator'),
    'genre': ('xsd', 'string', 'nid3', 'contentType'),
    'keywords': ('subject', '', 'nao', 'hasTag'),
    'language': ('language', '', 'nie', 'language'),
    'page count': ('format', '', 'nfo', 'pageCount'),
    'producer': ('format', 'Software', 'nie', 'generator'),
    'software': ('format', 'Software', 'nie', 'generator'),
    'size': ('format', '', '', ''),
    'subject': ('subject', '', 'nie', 'subject'),
    'title': ('title', '', 'nie', 'title'),
    'track number': ('format', '', '', ''),
    'year': ('date', '', '', ''),
    'creationdate': ('date', '', 'nie', 'created'),
}
#~ EXTRACTOR_KEYWORD['pdf version'] = ()
 
#paragraph count
#line count
#word count
#page count
#character count - 817
#last saved by
#revision history
 
KEYWORD_TYPE = {}
/trunk/mex/data/plugins/CoreMetadataPlugin.py
1,21 → 1,268
# -*- coding: utf-8; tab-width: 4; indent-tabs-mode: t; python-indent: 4 -*-
#!/usr/bin/python
# -*- coding: utf-8 -*-
 
"""
# File: srv_application.py
# File: CoreMetadataPlugin.py
# Author: Tomás Vírseda
# License: GPL v3
# Description: Plugin test
# Description: Get core metadata from a file
"""
 
 
import os
import magic
from stat import ST_SIZE
import html
from gi.repository import Gio
import datetime
from rdflib import Literal
from mex.core.mod_rdf import *
from mex.core.mod_log import get_logger
from mex.services.srv_plugins import IMetadataPlugin
 
 
class CoreMetadataPlugin(IMetadataPlugin):
def print_name(self):
def get_services(self):
    """Fetch the 'Utils' service from ``self.app`` and keep it as ``self.srvutl``."""
    self.srvutl = self.app.get_service('Utils')
 
def get_metadata(self, app, path):
    """Collect filesystem-level metadata for ``path``.

    Args:
        app: Application object; stored on ``self.app`` and used to look
            up the 'Utils' service.
        path: Path of the file to describe.

    Returns:
        A list of ``(predicate, object)`` tuples: file URL, extension,
        file name, size, three timestamps, MIME type and an ``rdf:type``
        derived from the part of the MIME type before the slash.
    """
    self.app = app
    self.get_services()

    metadata = []
    self.log = get_logger(__class__.__name__)
    self.log.debug ("This is a simple Yapsy plugin")
    this_mime = magic.Magic(mime=True)
    this_magic = magic.open(magic.MAGIC_MIME_TYPE)
    this_magic.load()

    # File URL
    metadata.append((NFO['fileUrl'], Literal(path)))

    # Extension: lower-cased, without the leading dot; placeholder if none
    rest, extension = os.path.splitext(path)
    ext = extension[1:].lower()
    metadata.append((NFO['fileExtension'], Literal(ext or '#noext#')))

    # File name (base name plus the extension as found on disk)
    basename = os.path.basename(rest)
    metadata.append((NFO['fileName'], Literal(basename + extension)))

    # Size in bytes
    metadata.append((NFO['fileSize'], Literal(os.stat(path)[ST_SIZE])))

    # Last modification
    t = os.path.getmtime(path)
    mdate = self.srvutl.get_timestamp(datetime.datetime.fromtimestamp(t))
    metadata.append((NFO['fileLastModified'], Literal(mdate)))

    # On Unix, time of the last change. On Windows the creation time.
    # NOTE(review): this is stored under fileLastModified a second time;
    # confirm whether a distinct predicate was intended.
    t = os.path.getctime(path)
    cdate = self.srvutl.get_timestamp(datetime.datetime.fromtimestamp(t))
    metadata.append((NFO['fileLastModified'], Literal(cdate)))

    # Last access
    t = os.path.getatime(path)
    adate = self.srvutl.get_timestamp(datetime.datetime.fromtimestamp(t))
    metadata.append((NFO['fileLastAccessed'], Literal(adate)))

    # MIME type; the second detector is only consulted when the first
    # one falls back to the generic binary type.
    mimetype = this_mime.from_file(path)

    if mimetype == 'application/octet-stream':
        try:
            mimetype = this_magic.file(path)
        except Exception as error:
            self.log.debug("MIME fallback failed for %s: %s" % (path, error))
            mimetype = 'application/ms-rubbish'

    if mimetype is None:
        mimetype = 'application/octet-stream'

    if mimetype.startswith('Composite Document'):
        mimetype = 'application/ms-rubbish'

    metadata.append((NIE['mimeType'], Literal(mimetype)))

    # Document type, taken from the part before the last slash. Without
    # a slash the whole string is used (slicing with rfind's -1 would
    # silently drop the final character).
    slash = mimetype.rfind('/')
    mtype = mimetype[:slash] if slash != -1 else mimetype
    if mtype == "text":
        nfotype = "TextDocument"
    elif mtype == "inode":
        nfotype = "Folder"
    else:
        nfotype = mtype.capitalize()

    if nfotype == 'Message':
        metadata.append((RDF.type, NMO[nfotype]))
    else:
        metadata.append((RDF.type, NFO[nfotype]))

    return metadata
 
 
def miner(self, p, o):
    """Translate one extracted ``keyword -> value`` pair into RDF metadata.

    Args:
        p: Keyword name; matched case-insensitively.
        o: Raw text value. It is stripped and HTML-escaped exactly once
            before being stored.

    Returns:
        A list of ``(predicate, Literal)`` tuples. The list is empty when
        the value is blank, the keyword is unknown, or the keyword is
        deliberately ignored ('format').

    Raises:
        Exception: any error is logged through ``self.log`` and re-raised.
    """
    # Keywords whose escaped value is stored unchanged under one
    # predicate, given as (namespace, local name).
    plain = {
        'company': (NCO, 'org'),
        'organization': (NCO, 'org'),
        'subject': (NIE, 'subject'),
        'description': (NIE, 'subject'),
        'comment': (NIE, 'subject'),
        # Users are encouraged to use the two-letter code of RFC 3066.
        'language': (NIE, 'language'),
        'codepage': (NFO, 'codepage'),
        'codepage_doc': (NFO, 'codepage'),
        'num_chars': (NFO, 'characterCount'),
        'character count': (NFO, 'characterCount'),
        'characters': (NFO, 'characterCount'),
        'lines': (NFO, 'lineCount'),
        'line count': (NFO, 'lineCount'),
        'paragraphs': (NFO, 'paragraphCount'),
        'paragraph count': (NFO, 'paragraphCount'),
        'num_pages': (NFO, 'pageCount'),
        'pages': (NFO, 'pageCount'),
        'num_words': (NFO, 'wordCount'),
        'word count': (NFO, 'wordCount'),
        'words': (NFO, 'wordCount'),
        'template': (NFO, 'template'),
        'mimetype': (NIE, 'mimeType'),
        'track number': (NID3, 'trackNumber'),
        'album': (NID3, 'albumTitle'),
        'genre': (NID3, 'genre'),
        'content type': (NID3, 'genre'),
        'year': (NID3, 'recordingYear'),
        'disc number': (NID3, 'discNumber'),
        'camera model': (NEXIF, 'cameraModel'),
        'camera make': (NEXIF, 'cameraMaker'),
        'aperture': (NEXIF, 'apertureValue'),
        'exposure': (NEXIF, 'exposureValue'),
        'exposure bias': (NEXIF, 'exposureBiasValue'),
        'exposure mode': (NEXIF, 'exposureMode'),
        'iso speed': (NEXIF, 'isoSpeed'),
        'focal length': (NEXIF, 'focalLength'),
        'flash': (NEXIF, 'flash'),
        'metering mode': (NEXIF, 'meteringMode'),
        'orientation': (NEXIF, 'orientation'),
        # 'format' itself is ignored (values such as "PDF 1.4").
        'resource-type': (DC, 'format'),
        'filename': (NIE, 'filename'),
        'image size': (NEXIF, 'imageSize'),
        'width': (NEXIF, 'imageWidth'),
        'image width': (NEXIF, 'imageWidth'),
        'height': (NEXIF, 'imageHeight'),
        'image height': (NEXIF, 'imageHeight'),
        'x resolution': (NEXIF, 'xResolution'),
        'y resolution': (NEXIF, 'yResolution'),
        'bits per sample': (NEXIF, 'bitsPerSample'),
        'y cb cr sub sampling': (NEXIF, 'yCbCrSubSampling'),
        'compression': (NEXIF, 'compression'),
        'image length': (NEXIF, 'imageLength'),
        'device attributes': (NEXIF, 'deviceSettingDescription'),
        'encoding process': (NEXIF, 'encodingProcess'),
        'rendering intent': (NEXIF, 'customRendered'),
    }

    try:
        raw = o.strip()
        o = html.escape(raw)
        if not o:
            return []
        p = p.lower()
        # Title-case before escaping; doing it afterwards would corrupt
        # entities such as "&amp;" into "&Amp;".
        titled = html.escape(raw.title())

        if p == 'title':
            # html.escape() already protected <, > and &; escaping them
            # again would produce double-encoded entities.
            title = o.replace('\n', '').replace('\t', '').strip()
            return [(NIE['title'], Literal(title))]

        if p in ('creation date', 'create_time'):
            # Uses the same Utils service as get_metadata(); requires
            # get_services() to have run. The unescaped text is parsed.
            cdate = self.srvutl.get_timestamp(raw)
            return [(NFO['fileCreated'], Literal(cdate))]

        if p in ('creator', 'author', 'last saved by', 'artist'):
            # Creator of a data object, an entity primarily responsible
            # for the creation of its content. Names matching the
            # blacklist are software generators, not people.
            blacklist = getattr(self, 'CREATOR_BLACKLIST', ())
            lowered = o.lower()
            if any(entry.lower() in lowered for entry in blacklist):
                return [(NIE['generator'], Literal(o))]
            return [(NCO['creator'], Literal(titled))]

        if p in ('last_saved_by', 'lastmodifiedby'):
            return [
                (NCO['lastModifiedBy'], Literal(titled)),
                (NCO['creator'], Literal(titled)),
            ]

        if p in ('application', 'creating_application', 'generator',
                 'producer', 'software', 'publisher', 'primary platform'):
            return [(NIE['generator'], Literal(titled))]

        if p == 'size':
            # The dimensions are in the value ("<width>x<height>"), not
            # in the keyword name.
            width, sep, height = o.partition('x')
            if not sep:
                return []
            return [
                (NEXIF['imageWidth'], Literal(width.strip())),
                (NEXIF['imageHeight'], Literal(height.strip())),
            ]

        target = plain.get(p)
        if target is None:
            # Unknown or ignored property.
            # TODO: log entries
            return []
        namespace, name = target
        return [(namespace[name], Literal(o))]
    except Exception as error:
        self.log.error(error)
        raise
 
/trunk/mex/services/srv_application.py
7,9 → 7,12
# Description: Application service
"""
 
 
import os
import uuid
from mex.core.mod_srv import Service
from mex.core.mod_env import GPATH, LPATH, APP
from mex.core.mod_env import GPATH, LPATH, APP, SEP
from mex.core.mod_rdf import BASE, ConjunctiveGraph, URIRef, NSBINDINGS, Literal
from pprint import pprint
 
 
class MexAppSrv(Service):
29,17 → 32,83
 
 
 
def run(self):
def run(self, params):
    """Scan the source tree, collect plugin metadata and export it.

    Every 'Metadata' plugin is asked for ``(predicate, object)`` tuples
    for each file under ``params.SOURCE_PATH``. The tuples are added to an
    RDF graph, written to ``mygraph.rdf`` in the current directory, and
    rendered as a Graphviz file named ``repository.dot``.

    Args:
        params: Parsed command-line options; ``SOURCE_PATH`` and
            ``TARGET_PATH`` are read. When ``TARGET_PATH`` is ``None``
            the dot file is written under ``LPATH['EXPORT']``.
    """
    def dot_quote(text):
        """Escape backslashes and double quotes for a quoted DOT string."""
        return str(text).replace('\\', '\\\\').replace('"', '\\"')

    self.log.debug("TARGET_PATH: %s" % params.TARGET_PATH)
    if params.TARGET_PATH is None:
        target_dir = LPATH['EXPORT']
    else:
        target_dir = params.TARGET_PATH
    target_path = os.path.join(target_dir, 'repository.dot')

    repodict = {}  # path -> {'uuid': node id, short predicate: value}
    repoobjs = {}  # value -> id of the DOT node that represents it
    self.log.debug("Starting Metadata eXplorer")
    self.srv_plugins = self.get_service('Plugins')
    self.plugins = self.srv_plugins.get_plugins_by_category('Metadata')
    self.log.debug("Found %d plugins for 'Metadata' category" % len(self.plugins))
    for plugin in self.plugins:
        self.log.debug("\tPlugin: %s" % plugin.name)
    fileset = self.srvutl.get_files_from_dir(params.SOURCE_PATH)
    self.log.debug("Got %d files" % len(fileset))

    graph = ConjunctiveGraph()
    for prefix, namespace in NSBINDINGS.items():
        graph.bind(prefix, namespace)

    for path in fileset:
        subject = URIRef("%s%s" % (BASE, path))
        props = repodict[path] = {'uuid': str(uuid.uuid4())}
        tuples = []
        for plugin in self.plugins:
            try:
                tuples.extend(plugin.plugin_object.get_metadata(self.app, path))
            except Exception as error:
                # Best effort: one failing plugin must not abort the scan.
                self.log.error("Plugin %s failed on %s: %s" % (plugin.name, path, error))
        for p, o in tuples:
            graph.add((subject, p, o))
            predicate = self.srvutl.pname(p)
            if isinstance(o, (URIRef, Literal)):
                o = str(self.srvutl.pname(o))
            props[predicate] = o
            if o not in repoobjs:
                repoobjs[o] = str(uuid.uuid4())

    nodes = []
    relations = []
    for path in fileset:
        props = repodict[path]
        if 'fileName' not in props:
            # No plugin supplied a file name, so the node cannot be labelled.
            continue
        file_id = props['uuid']
        nodes.append("\t\"%s\" [shape=cylinder label=\"%s\" URL=\"file://%s\"]\n" % (
            file_id, dot_quote(props['fileName']), dot_quote(path)))
        for prop, value in props.items():
            if prop in ('uuid', 'fileName', 'fileUrl'):
                continue
            value_id = repoobjs[value]
            nodes.append("\t\"%s\" [label=\"%s\"]\n" % (value_id, dot_quote(value)))
            relations.append("\t\"%s\" -- \"%s\" [label=\"%s\"]\n" % (
                file_id, value_id, dot_quote(prop)))

    # Nodes first, then edges, as in the generated layout so far.
    output = "graph graphname{\n" + "".join(nodes) + "".join(relations) + "}"

    with open(target_path, 'w', encoding='utf-8') as dot:
        dot.write(output)

    # Let rdflib write the file itself: without a destination,
    # serialize() returns str or bytes depending on the rdflib version,
    # so writing its result to a binary handle is not portable.
    graph.serialize(destination='mygraph.rdf', format='pretty-xml')
    self.log.debug("Generated dot file in: %s" % target_path)
 
 
 
 
 
 
def end(self):
    """Log that the application service is stopping."""
    self.log.debug("Stopping Metadata eXplorer")
 
/trunk/mex/services/srv_utils.py
8,6 → 8,7
"""
 
import os
from dateutil import parser as dtuparser
from mex.core.mod_env import GPATH, LPATH, FILE
from mex.core.mod_srv import Service
 
41,4 → 42,16
return fileset
 
 
def get_timestamp(self, adate):
    """Return ``adate`` formatted as ``YYYY-MM-DD HH:MM:SS``.

    The argument is converted to text and parsed with dateutil, so both
    date strings and datetime objects are accepted.
    """
    parsed = dtuparser.parse(str(adate))
    fields = (
        parsed.year, parsed.month, parsed.day,
        parsed.hour, parsed.minute, parsed.second,
    )
    return "%4d-%02d-%02d %02d:%02d:%02d" % fields
 
 
def pname(self, predicate):
    """Return the part of ``predicate`` after its last '/' or '#'.

    Whichever separator occurs later wins; a string with neither is
    returned unchanged.
    """
    cut = max(predicate.rfind('/'), predicate.rfind('#'))
    return predicate[cut + 1:]
/trunk/mex/mex.py
11,6 → 11,7
import sys
import signal
import shutil
import argparse
 
from mex.core.mod_env import APP, LPATH, GPATH, FILE
from mex.core.mod_log import get_logger
21,13 → 22,27
#DOC: http://stackoverflow.com/questions/16410852/keyboard-interrupt-with-with-python-gtk
signal.signal(signal.SIGINT, signal.SIG_DFL)
 
 
class readable_dir(argparse.Action):
    """argparse action that accepts only an existing, readable directory.

    On success the directory is stored on the namespace under ``dest``.
    Otherwise the failure is reported through ``parser.error()``, which
    prints the usage line and exits, instead of the interactive ``exit()``
    helper.
    """
    # https://stackoverflow.com/questions/11415570/directory-path-types-with-argparse
    def __call__(self, parser, namespace, values, option_string=None):
        prospective_dir = values
        if not os.path.isdir(prospective_dir):
            parser.error("'{0}' is not a valid path".format(prospective_dir))
        if not os.access(prospective_dir, os.R_OK):
            parser.error("'{0}' is not a readable dir".format(prospective_dir))
        setattr(namespace, self.dest, prospective_dir)
 
 
class MeX(object):
"""
MeX Application class
"""
def __init__(self):
def __init__(self, params):
"""
"""
self.params = params
self.setup_environment()
self.setup_logging()
self.setup_services()
136,14 → 151,23
Start MeX
"""
app = self.get_service('Application')
app.run()
app.run(self.params)
 
 
def parse_args(argv=None):
    """Build the MeX command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` argparse
            reads ``sys.argv[1:]``, as before.

    Returns:
        argparse.Namespace with ``SOURCE_PATH`` (required) and
        ``TARGET_PATH`` (``None`` when the option is not given).
    """
    parser = argparse.ArgumentParser(description="Metadata Explorer")
    parser.add_argument('-sp', '--source-path', dest="SOURCE_PATH", action=readable_dir, help="Source path of the repository to be analyzed", required=True)
    parser.add_argument('-tp', '--target-path', dest="TARGET_PATH", action=readable_dir, help="Target path for output files")
    parser.add_argument('-v', '--version', action='version', version='%s %s' % ("MeX", '0.1'))
    return parser.parse_args(argv)
 
 
def main():
    """
    Entry point: parse the command line, then build, run and stop MeX.
    """
    # MeX needs the parsed parameters, so they are read before the
    # application object is created.
    params = parse_args()
    mex = MeX(params)
    mex.run()
    mex.stop()
 
/trunk/Changelog
1,3 → 1,8
2019-09-01 Tomás Vírseda <tomasvirseda@gmail.com>
 
* First prototype built successfully
 
 
2019-08-31 Tomás Vírseda <tomasvirseda@gmail.com>
 
* Added Yapsy plugin manager and some plugin examples
/trunk/setup.py
73,7 → 73,10
# distutils does not support install_requires, but pip needs it to be
# able to automatically install dependencies
install_requires=[
'yapsy',
'uuid',
'rdflib',
'yapsy',
'python-dateutil'
],
include_package_data=True,
data_files=data_files,